diff --git a/WORKSPACE b/WORKSPACE index 74ea14d0fd7..622fa4d1412 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -49,34 +49,34 @@ remote_config_workspace() # Apple and Swift rules. http_archive( name = "build_bazel_rules_apple", - sha256 = "6efdde60c91724a2be7f89b0c0a64f01138a45e63ba5add2dca2645d981d23a1", - urls = ["https://github.com/bazelbuild/rules_apple/releases/download/0.17.2/rules_apple.0.17.2.tar.gz"], + sha256 = "a045a436b642c70fb0c10ca84ff0fd2dcbd59cc89100d597a61e8374afafb366", + urls = ["https://github.com/bazelbuild/rules_apple/releases/download/0.18.0/rules_apple.0.18.0.tar.gz"], ) # https://github.com/bazelbuild/rules_apple/releases http_archive( name = "build_bazel_rules_swift", - sha256 = "96a86afcbdab215f8363e65a10cf023b752e90b23abf02272c4fc668fcb70311", - urls = ["https://github.com/bazelbuild/rules_swift/releases/download/0.11.1/rules_swift.0.11.1.tar.gz"], + sha256 = "18cd4df4e410b0439a4935f9ca035bd979993d42372ba79e7f2d4fafe9596ef0", + urls = ["https://github.com/bazelbuild/rules_swift/releases/download/0.12.1/rules_swift.0.12.1.tar.gz"], ) # https://github.com/bazelbuild/rules_swift/releases http_archive( name = "build_bazel_apple_support", - sha256 = "7356dbd44dea71570a929d1d4731e870622151a5f27164d966dda97305f33471", - urls = ["https://github.com/bazelbuild/apple_support/releases/download/0.6.0/apple_support.0.6.0.tar.gz"], + sha256 = "122ebf7fe7d1c8e938af6aeaee0efe788a3a2449ece5a8d6a428cb18d6f88033", + urls = ["https://github.com/bazelbuild/apple_support/releases/download/0.7.1/apple_support.0.7.1.tar.gz"], ) # https://github.com/bazelbuild/apple_support/releases http_archive( name = "bazel_skylib", - sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e", - urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.8.0/bazel-skylib.0.8.0.tar.gz"], + sha256 = "1dde365491125a3db70731e25658dfdd3bc5dbdfd11b840b3e987ecf043c7ca0", + urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel-skylib.0.9.0.tar.gz"], ) # https://github.com/bazelbuild/bazel-skylib/releases http_archive( name = "com_github_apple_swift_swift_protobuf", type = "zip", - strip_prefix = "swift-protobuf-1.5.0/", - urls = ["https://github.com/apple/swift-protobuf/archive/1.5.0.zip"], + strip_prefix = "swift-protobuf-1.6.0/", + urls = ["https://github.com/apple/swift-protobuf/archive/1.6.0.zip"], ) # https://github.com/apple/swift-protobuf/releases http_file( name = "xctestrunner", executable = 1, - urls = ["https://github.com/google/xctestrunner/releases/download/0.2.7/ios_test_runner.par"], + urls = ["https://github.com/google/xctestrunner/releases/download/0.2.9/ios_test_runner.par"], ) # https://github.com/google/xctestrunner/releases # Use `swift_rules_dependencies` to fetch the toolchains. With the # `git_repository` rules above, the following call will skip redefining them. 
diff --git a/arm_compiler.BUILD b/arm_compiler.BUILD index db2e9bbe1e1..cffe3fac70d 100644 --- a/arm_compiler.BUILD +++ b/arm_compiler.BUILD @@ -3,56 +3,56 @@ package(default_visibility = ["//visibility:public"]) filegroup( name = "gcc", srcs = [ - "bin/arm-linux-gnueabihf-gcc", + "bin/arm-rpi-linux-gnueabihf-gcc", ], ) filegroup( name = "ar", srcs = [ - "bin/arm-linux-gnueabihf-ar", + "bin/arm-rpi-linux-gnueabihf-ar", ], ) filegroup( name = "ld", srcs = [ - "bin/arm-linux-gnueabihf-ld", + "bin/arm-rpi-linux-gnueabihf-ld", ], ) filegroup( name = "nm", srcs = [ - "bin/arm-linux-gnueabihf-nm", + "bin/arm-rpi-linux-gnueabihf-nm", ], ) filegroup( name = "objcopy", srcs = [ - "bin/arm-linux-gnueabihf-objcopy", + "bin/arm-rpi-linux-gnueabihf-objcopy", ], ) filegroup( name = "objdump", srcs = [ - "bin/arm-linux-gnueabihf-objdump", + "bin/arm-rpi-linux-gnueabihf-objdump", ], ) filegroup( name = "strip", srcs = [ - "bin/arm-linux-gnueabihf-strip", + "bin/arm-rpi-linux-gnueabihf-strip", ], ) filegroup( name = "as", srcs = [ - "bin/arm-linux-gnueabihf-as", + "bin/arm-rpi-linux-gnueabihf-as", ], ) diff --git a/configure.py b/configure.py index 4391fab507a..115c170238f 100644 --- a/configure.py +++ b/configure.py @@ -1388,11 +1388,18 @@ def main(): if (environ_cp.get('TF_NEED_CUDA') == '1' and 'TF_CUDA_CONFIG_REPO' not in environ_cp): + tensor_rt_question = ( + 'Do you wish to build TensorFlow with TensorRT support? NB! There ' + + 'are known ODR violations between TensorRT and cuDNN that may result ' + + 'in application crashes and/or data corruption. Please see ' + + 'https://github.com/tensorflow/tensorflow/issues/32480 for details.') + set_action_env_var( environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False, + question=tensor_rt_question, bazel_config_name='tensorrt') environ_save = dict(environ_cp) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index ed4f10e0f77..ae6e582a421 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -159,7 +159,7 @@ TF_DeprecatedSession* TF_NewDeprecatedSession(const TF_SessionOptions* opt, TF_Status* status) { Session* session; status->status = NewSession(opt->options, &session); - if (TF_GetCode(status) == TF_OK) { + if (status->status.ok()) { return new TF_DeprecatedSession({session}); } else { DCHECK_EQ(nullptr, session); @@ -332,7 +332,7 @@ bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) { // TODO(nolivia): check this on a subset of the graph instead of all of // it. status->status = graph::ValidateGraphHasNoCycle(session->graph->graph); - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { session->graph->mu.unlock(); return false; } @@ -352,7 +352,7 @@ bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) { *graph_def.mutable_library() = graph.flib_def().ToProto(); session->graph->mu.unlock(); status->status = session->session->Extend(std::move(graph_def)); - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { // Contract is we always delete input_values[i]. 
return false; } @@ -382,7 +382,7 @@ static bool TF_Run_Inputs(TF_Tensor* const* c_inputs, const int ninputs = input_pairs->size(); for (int i = 0; i < ninputs; ++i) { status->status = TF_TensorToTensor(c_inputs[i], &(*input_pairs)[i].second); - if (TF_GetCode(status) != TF_OK) return false; + if (!status->status.ok()) return false; } return true; } @@ -439,7 +439,7 @@ static void TF_Run_Helper( // Serialize back to upstream client, who now owns the new buffer if (run_metadata != nullptr) { status->status = MessageToBuffer(run_metadata_proto, run_metadata); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; } } else { // NOTE(zongheng): PRun does not support RunOptions yet. @@ -459,7 +459,7 @@ static void TF_Run_Helper( continue; } c_outputs[i] = TF_TensorFromTensor(src, status); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; } } @@ -516,7 +516,7 @@ void TF_PRunSetup(TF_DeprecatedSession* s, string new_handle; status->status = s->session->PRunSetup(input_names, output_names, target_oper_names, &new_handle); - if (TF_GetCode(status) == TF_OK) { + if (status->status.ok()) { char* buf = new char[new_handle.size() + 1]; memcpy(buf, new_handle.c_str(), new_handle.size() + 1); *handle = buf; @@ -555,7 +555,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) { status->status = tensorflow::LoadLibrary( library_filename, &lib_handle->lib_handle, &lib_handle->op_list.data, &lib_handle->op_list.length); - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { delete lib_handle; return nullptr; } @@ -983,7 +983,7 @@ void TF_SetAttrTensor(TF_OperationDescription* desc, const char* attr_name, TF_Tensor* value, TF_Status* status) { Tensor t; status->status = TF_TensorToTensor(value, &t); - if (TF_GetCode(status) == TF_OK) desc->node_builder.Attr(attr_name, t); + if (status->status.ok()) desc->node_builder.Attr(attr_name, t); } void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name, @@ -993,13 +993,13 @@ void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name, std::vector t; t.reserve(num_values); - for (int i = 0; i < num_values && TF_GetCode(status) == TF_OK; ++i) { + for (int i = 0; i < num_values && status->status.ok(); ++i) { Tensor v; status->status = TF_TensorToTensor(values[i], &v); t.emplace_back(v); } - if (TF_GetCode(status) == TF_OK) desc->node_builder.Attr(attr_name, t); + if (status->status.ok()) desc->node_builder.Attr(attr_name, t); } void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name, @@ -1048,11 +1048,11 @@ static TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc, status->status = desc->node_builder.Finalize(&desc->graph->graph, &ret, /*consume=*/true); - if (TF_GetCode(status) == TF_OK) { + if (status->status.ok()) { // Run shape inference function for newly added node. status->status = desc->graph->refiner.AddNode(ret); } - if (TF_GetCode(status) == TF_OK) { + if (status->status.ok()) { // Add the node to the name-to-node mapping. 
desc->graph->name_map[ret->name()] = ret; } else if (ret != nullptr) { @@ -1101,7 +1101,7 @@ int TF_OperationOutputListLength(TF_Operation* oper, const char* arg_name, NameRangeMap name_ranges; status->status = NameRangesForNode(oper->node, oper->node.op_def(), nullptr, &name_ranges); - if (TF_GetCode(status) != TF_OK) return -1; + if (!status->status.ok()) return -1; auto iter = name_ranges.find(arg_name); if (iter == name_ranges.end()) { status->status = InvalidArgument("Output arg '", arg_name, "' not found"); @@ -1123,7 +1123,7 @@ int TF_OperationInputListLength(TF_Operation* oper, const char* arg_name, NameRangeMap name_ranges; status->status = NameRangesForNode(oper->node, oper->node.op_def(), &name_ranges, nullptr); - if (TF_GetCode(status) != TF_OK) return -1; + if (!status->status.ok()) return -1; auto iter = name_ranges.find(arg_name); if (iter == name_ranges.end()) { status->status = InvalidArgument("Input arg '", arg_name, "' not found"); @@ -1142,6 +1142,16 @@ TF_Output TF_OperationInput(TF_Input oper_in) { return {ToOperation(edge->src()), edge->src_output()}; } +void TF_OperationAllInputs(TF_Operation* oper, TF_Output* inputs, + int max_inputs) { + for (auto* edge : oper->node.in_edges()) { + if (edge->dst_input() >= 0 && edge->dst_input() < max_inputs) { + inputs[edge->dst_input()] = {ToOperation(edge->src()), + edge->src_output()}; + } + } +} + int TF_OperationOutputNumConsumers(TF_Output oper_out) { int count = 0; for (const auto* edge : oper_out.oper->node.out_edges()) { @@ -1221,7 +1231,7 @@ TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper, TF_Status* status) { TF_AttrMetadata metadata; const auto* attr = GetAttrValue(oper, attr_name, status); - if (TF_GetCode(status) != TF_OK) return metadata; + if (!status->status.ok()) return metadata; switch (attr->value_case()) { #define SINGLE_CASE(kK, attr_type, size_expr) \ case tensorflow::AttrValue::kK: \ @@ -1328,7 +1338,7 @@ void TF_OperationGetAttrString(TF_Operation* oper, const char* attr_name, void* value, size_t max_length, TF_Status* status) { const auto* attr = GetAttrValue(oper, attr_name, status); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; if (attr->value_case() != tensorflow::AttrValue::kS) { status->status = InvalidArgument("Attribute '", attr_name, "' is not a string"); @@ -1346,7 +1356,7 @@ void TF_OperationGetAttrStringList(TF_Operation* oper, const char* attr_name, int max_values, void* storage, size_t storage_size, TF_Status* status) { const auto* attr = GetAttrValue(oper, attr_name, status); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; if (attr->value_case() != tensorflow::AttrValue::kList) { status->status = InvalidArgument("Value for '", attr_name, "' is not a list"); @@ -1379,7 +1389,7 @@ void TF_OperationGetAttrStringList(TF_Operation* oper, const char* attr_name, void func##List(TF_Operation* oper, const char* attr_name, c_type* values, \ int max_values, TF_Status* status) { \ const auto* attr = GetAttrValue(oper, attr_name, status); \ - if (TF_GetCode(status) != TF_OK) return; \ + if (!status->status.ok()) return; \ if (attr->value_case() != tensorflow::AttrValue::kList) { \ status->status = \ InvalidArgument("Value for '", attr_name, "' is not a list."); \ @@ -1401,7 +1411,7 @@ void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name, PartialTensorShape shape; status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shape); - if (TF_GetCode(status) != TF_OK) return; + if 
(!status->status.ok()) return; auto len = std::min(shape.dims(), num_dims); for (int i = 0; i < len; ++i) { value[i] = shape.dim_size(i); @@ -1415,7 +1425,7 @@ void TF_OperationGetAttrShapeList(TF_Operation* oper, const char* attr_name, std::vector shapes; status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shapes); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; auto len = std::min(static_cast(shapes.size()), num_shapes); int64_t* p = storage; int storage_left = storage_size; @@ -1443,7 +1453,7 @@ void TF_OperationGetAttrTensorShapeProto(TF_Operation* oper, const char* attr_name, TF_Buffer* value, TF_Status* status) { const auto* attr = GetAttrValue(oper, attr_name, status); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; if (attr->value_case() != tensorflow::AttrValue::kShape) { status->status = InvalidArgument("Value for '", attr_name, "' is not a shape."); @@ -1457,7 +1467,7 @@ void TF_OperationGetAttrTensorShapeProtoList(TF_Operation* oper, TF_Buffer** values, int max_values, TF_Status* status) { const auto* attr = GetAttrValue(oper, attr_name, status); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; if (attr->value_case() != tensorflow::AttrValue::kList) { status->status = InvalidArgument("Value for '", attr_name, "' is not a list"); @@ -1467,7 +1477,7 @@ void TF_OperationGetAttrTensorShapeProtoList(TF_Operation* oper, for (int i = 0; i < len; ++i) { values[i] = TF_NewBuffer(); status->status = MessageToBuffer(attr->list().shape(i), values[i]); - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { // Delete everything allocated to far, the operation has failed. for (int j = 0; j <= i; ++j) { TF_DeleteBuffer(values[j]); @@ -1482,7 +1492,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name, *value = nullptr; Tensor t; status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; *value = TF_TensorFromTensor(t, status); } @@ -1491,7 +1501,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name, TF_Status* status) { std::vector ts; status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &ts); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; const auto len = std::min(max_values, static_cast(ts.size())); for (int i = 0; i < len; ++i) { values[i] = TF_TensorFromTensor(ts[i], status); @@ -1502,7 +1512,7 @@ void TF_OperationGetAttrValueProto(TF_Operation* oper, const char* attr_name, TF_Buffer* output_attr_value, TF_Status* status) { const auto* attr = GetAttrValue(oper, attr_name, status); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; status->status = MessageToBuffer(*attr, output_attr_value); } @@ -1583,7 +1593,7 @@ void TF_GraphGetOpDef(TF_Graph* graph, const char* op_name, { mutex_lock l(graph->mu); status->status = graph->graph.op_registry()->LookUpOpDef(op_name, &op_def); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; } status->status = MessageToBuffer(*op_def, output_op_def); } @@ -1701,7 +1711,7 @@ static void GraphImportGraphDefLocked(TF_Graph* graph, const GraphDef& def, tensorflow::ImportGraphDefResults results; status->status = tensorflow::ImportGraphDef(opts->opts, def, &graph->graph, &graph->refiner, &results); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; // Add new 
nodes to name_map for (int i = last_node_id; i < graph->graph.num_node_ids(); ++i) { @@ -1755,7 +1765,7 @@ TF_ImportGraphDefResults* TF_GraphImportGraphDefWithResults( auto results = new TF_ImportGraphDefResults(); mutex_lock l(graph->mu); GraphImportGraphDefLocked(graph, def, options, results, status); - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { delete results; return nullptr; } @@ -1813,7 +1823,7 @@ bool CreateInput(const TF_Output& parent_input, TF_Graph* g, const char* name, TF_SetAttrType(desc, "dtype", TF_OperationOutputType(parent_input)); // TODO(skyewm): set placeholder shape TF_Operation* oper = TF_FinishOperation(desc, status); - if (TF_GetCode(status) != TF_OK) return false; + if (!status->status.ok()) return false; *input = {oper, 0}; return true; } @@ -1958,7 +1968,7 @@ TF_WhileParams TF_NewWhile(TF_Graph* g, TF_Output* inputs, int ninputs, TF_WhileParams params = {ninputs, cond_graph, cond_inputs, cond_output, body_graph, body_inputs, body_outputs, name}; - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { FreeWhileResources(¶ms); return EmptyWhileParams(); } @@ -2160,7 +2170,7 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt, TF_Status* status) { Session* session; status->status = NewSession(opt->options, &session); - if (TF_GetCode(status) == TF_OK) { + if (status->status.ok()) { TF_Session* new_session = new TF_Session(session, graph); if (graph != nullptr) { mutex_lock l(graph->mu); @@ -2208,7 +2218,7 @@ TF_Session* TF_LoadSessionFromSavedModel( status->status = tensorflow::LoadSavedModel(session_options->options, run_options_proto, export_dir, tag_set, &bundle); - if (TF_GetCode(status) != TF_OK) return nullptr; + if (!status->status.ok()) return nullptr; // Create a TF_Graph from the MetaGraphDef. This is safe as long as Session // extends using GraphDefs. 
The Graph instance is different, but equivalent @@ -2221,11 +2231,11 @@ TF_Session* TF_LoadSessionFromSavedModel( GraphImportGraphDefLocked(graph, bundle.meta_graph_def.graph_def(), import_opts, &results, status); TF_DeleteImportGraphDefOptions(import_opts); - if (TF_GetCode(status) != TF_OK) return nullptr; + if (!status->status.ok()) return nullptr; if (meta_graph_def != nullptr) { status->status = MessageToBuffer(bundle.meta_graph_def, meta_graph_def); - if (TF_GetCode(status) != TF_OK) return nullptr; + if (!status->status.ok()) return nullptr; } TF_Session* session = new TF_Session(bundle.session.release(), graph); @@ -2325,7 +2335,7 @@ void TF_SessionPRunSetup(TF_Session* session, const TF_Output* inputs, string new_handle; status->status = session->session->PRunSetup(input_names, output_names, target_names, &new_handle); - if (TF_GetCode(status) == TF_OK) { + if (status->status.ok()) { char* buf = new char[new_handle.size() + 1]; memcpy(buf, new_handle.c_str(), new_handle.size() + 1); *handle = buf; @@ -2387,9 +2397,9 @@ unsigned char TF_TryEvaluateConstant(TF_Graph* graph, TF_Output output, tensor, graph->refiner, *graph->graph.op_registry(), graph->graph.versions().producer(), &evaluated, &result_tensor); if (evaluated) { - DCHECK(TF_GetCode(status) == TF_OK); + DCHECK(status->status.ok()); *result = TF_TensorFromTensor(result_tensor, status); - if (TF_GetCode(status) != TF_OK) evaluated = false; + if (!status->status.ok()) evaluated = false; } return evaluated; } @@ -2444,7 +2454,7 @@ TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, const char* name, TF_Buffer* ret = TF_NewBuffer(); status->status = MessageToBuffer(*api_def, ret); - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { TF_DeleteBuffer(ret); return nullptr; } @@ -2456,7 +2466,7 @@ TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status) { tensorflow::KernelList kernel_list = tensorflow::GetAllRegisteredKernels(); TF_Buffer* ret = TF_NewBuffer(); status->status = MessageToBuffer(kernel_list, ret); - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { TF_DeleteBuffer(ret); return nullptr; } @@ -2468,7 +2478,7 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) { tensorflow::GetRegisteredKernelsForOp(name); TF_Buffer* ret = TF_NewBuffer(); status->status = MessageToBuffer(kernel_list, ret); - if (TF_GetCode(status) != TF_OK) { + if (!status->status.ok()) { TF_DeleteBuffer(ret); return nullptr; } @@ -2498,7 +2508,7 @@ TF_Server* TF_NewServer(const void* proto, size_t proto_len, std::unique_ptr out_server; status->status = tensorflow::NewServer(server_def, &out_server); - if (TF_GetCode(status) != TF_OK) return nullptr; + if (!status->status.ok()) return nullptr; return new TF_Server(std::move(out_server)); #endif // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD) diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 4eeedd4cbc9..0c413f6ebae 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -435,6 +435,15 @@ TF_CAPI_EXPORT extern int TF_OperationInputListLength(TF_Operation* oper, // producer.index) to consumer.oper's input (given by consumer.index). TF_CAPI_EXPORT extern TF_Output TF_OperationInput(TF_Input oper_in); +// Get list of all inputs of a specific operation. `inputs` must point to +// an array of length at least `max_inputs` (ideally set to +// TF_OperationNumInputs(oper)). Beware that a concurrent +// modification of the graph can increase the number of inputs of +// an operation. 
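// Editor's sketch (not part of the patch): one way a caller might use the new
// TF_OperationAllInputs entry point declared just below. `op` is assumed to be a
// valid TF_Operation* obtained from an existing TF_Graph.
#include <cstdio>
#include <vector>
#include "tensorflow/c/c_api.h"

void PrintAllInputs(TF_Operation* op) {
  const int n = TF_OperationNumInputs(op);      // size the caller-owned buffer up front
  std::vector<TF_Output> inputs(n);
  TF_OperationAllInputs(op, inputs.data(), n);  // fills inputs[0..n-1]
  for (int i = 0; i < n; ++i) {
    std::printf("input %d <- %s:%d\n", i, TF_OperationName(inputs[i].oper),
                inputs[i].index);
  }
}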
+TF_CAPI_EXPORT extern void TF_OperationAllInputs(TF_Operation* oper, + TF_Output* inputs, + int max_inputs); + // Get the number of current consumers of a specific output of an // operation. Note that this number can change when new operations // are added to the graph. diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index f04f0175696..1fe9276ffc6 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -510,10 +510,6 @@ TFE_TensorHandle* TFE_DequeueVariantTensor(TF_Session* session, int tensor_id, return createTFEDequeue(ctx, TF_VARIANT, queue, status); } -static void CheckOk(TF_Status* status) { - CHECK_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); -} - void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) { auto* status = TF_NewStatus(); if (!TFE_TensorHandleIsConcrete(handle)) { diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 2742bead4e4..95005971e91 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -9,7 +9,6 @@ load( ) load( "//tensorflow/core/platform:default/build_config.bzl", - "tf_additional_device_tracer_test_flags", "tf_kernel_tests_linkstatic", ) load( @@ -27,6 +26,7 @@ tf_cuda_library( "c_api.cc", "c_api_debug.cc", "c_api_experimental.h", + "c_api_internal.cc", "c_api_internal.h", ], hdrs = ["c_api.h"], @@ -237,8 +237,7 @@ tf_cuda_cc_test( srcs = [ "c_api_experimental_test.cc", ], - args = - ["--heap_check=local"] + tf_additional_device_tracer_test_flags(), + args = ["--heap_check=local"], extra_copts = tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + ["nomac"], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index ff16eaf322d..10a1fa42f57 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -33,7 +33,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/framework/device_attributes.pb.h" -#include "tensorflow/core/platform/host_info.h" +#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/platform.h" // NOLINT #ifdef TENSORFLOW_EAGER_USE_XLA #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -61,6 +61,7 @@ limitations under the License. #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/blocking_counter.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/cleanup.h" @@ -100,32 +101,34 @@ string DeviceName(const tensorflow::Device* d) { tensorflow::Status GetAllRemoteDevices( const std::vector& remote_workers, tensorflow::WorkerCacheInterface* worker_cache, - std::unique_ptr* device_mgr) { + std::unique_ptr* device_mgr) { std::vector> remote_devices; - tensorflow::Status status; - // TODO(nareshmodi) do this in parallel instead of serially. 
- for (const string& remote_worker : remote_workers) { - tensorflow::Notification n; + tensorflow::mutex remote_devices_mu; + int num_remote_workers = remote_workers.size(); + tensorflow::BlockingCounter counter(num_remote_workers); + std::vector statuses(num_remote_workers); + for (int i = 0; i < num_remote_workers; i++) { tensorflow::NewRemoteDevices( - tensorflow::Env::Default(), worker_cache, remote_worker, - [&status, &n, &remote_devices]( + tensorflow::Env::Default(), worker_cache, remote_workers[i], + [i, &statuses, &counter, &remote_devices, &remote_devices_mu]( const tensorflow::Status& s, std::vector* devices) { - status = s; + statuses[i] = s; if (s.ok()) { + tensorflow::mutex_lock l(remote_devices_mu); for (tensorflow::Device* d : *devices) { remote_devices.emplace_back(d); } } - n.Notify(); + counter.DecrementCount(); }); - n.WaitForNotification(); } - std::unique_ptr remote_device_mgr( - new tensorflow::StaticDeviceMgr(std::move(remote_devices))); - - TF_RETURN_IF_ERROR(status); - + counter.Wait(); + for (int i = 0; i < num_remote_workers; i++) { + TF_RETURN_IF_ERROR(statuses[i]); + } + auto remote_device_mgr = absl::make_unique(); + TF_RETURN_IF_ERROR(remote_device_mgr->AddDevices(std::move(remote_devices))); *device_mgr = std::move(remote_device_mgr); return tensorflow::Status::OK(); } @@ -135,11 +138,15 @@ tensorflow::Status CreateRemoteContexts( int keep_alive_secs, const tensorflow::ServerDef& server_def, tensorflow::eager::EagerClientCache* remote_eager_workers, bool async, const tensorflow::eager::CreateContextRequest& base_request) { - for (int i = 0; i < remote_workers.size(); i++) { + int num_remote_workers = remote_workers.size(); + tensorflow::BlockingCounter counter(num_remote_workers); + std::vector statuses(num_remote_workers); + for (int i = 0; i < num_remote_workers; i++) { const string& remote_worker = remote_workers[i]; tensorflow::eager::CreateContextRequest request(base_request); - tensorflow::eager::CreateContextResponse response; + tensorflow::eager::CreateContextResponse* response = + new tensorflow::eager::CreateContextResponse(); request.set_context_id(context_id); tensorflow::DeviceNameUtils::ParsedName parsed_name; if (!tensorflow::DeviceNameUtils::ParseFullName(remote_worker, @@ -159,16 +166,17 @@ tensorflow::Status CreateRemoteContexts( return tensorflow::errors::Internal( "Cannot find a client for the given target:", remote_worker); } - tensorflow::Notification n; - tensorflow::Status status; - // TODO(nareshmodi) do this in parallel instead of serially. 
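// Editor's sketch (not part of the patch): the fan-out/fan-in shape that the two
// loops this patch parallelizes in this file now follow. `DoAsyncWork` is a
// hypothetical stand-in for NewRemoteDevices / CreateContextAsync.
#include <functional>
#include <vector>
#include "tensorflow/core/lib/core/blocking_counter.h"
#include "tensorflow/core/lib/core/errors.h"

void DoAsyncWork(int i, std::function<void(const tensorflow::Status&)> done);  // assumed async API

tensorflow::Status RunAllInParallel(int num_workers) {
  tensorflow::BlockingCounter counter(num_workers);
  std::vector<tensorflow::Status> statuses(num_workers);
  for (int i = 0; i < num_workers; ++i) {
    DoAsyncWork(i, [i, &statuses, &counter](const tensorflow::Status& s) {
      statuses[i] = s;            // each callback writes its own slot, so no mutex needed here
      counter.DecrementCount();   // one tick per completed worker
    });
  }
  counter.Wait();                 // fan-in: block until every callback has fired
  for (const auto& s : statuses) TF_RETURN_IF_ERROR(s);
  return tensorflow::Status::OK();
}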
eager_client->CreateContextAsync( - &request, &response, [&status, &n](const tensorflow::Status& s) { - status = s; - n.Notify(); + &request, response, + [i, &statuses, &counter, response](const tensorflow::Status& s) { + statuses[i] = s; + delete response; + counter.DecrementCount(); }); - n.WaitForNotification(); - TF_RETURN_IF_ERROR(status); + } + counter.Wait(); + for (int i = 0; i < num_remote_workers; i++) { + TF_RETURN_IF_ERROR(statuses[i]); } return tensorflow::Status::OK(); } @@ -215,7 +223,7 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( std::remove(remote_workers.begin(), remote_workers.end(), worker_name), remote_workers.end()); - std::unique_ptr remote_device_mgr; + std::unique_ptr remote_device_mgr; LOG_AND_RETURN_IF_ERROR(GetAllRemoteDevices( remote_workers, grpc_server->master_env()->worker_cache, &remote_device_mgr)); @@ -247,7 +255,7 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( LOG_AND_RETURN_IF_ERROR( CreateRemoteContexts(remote_workers, context_id, keep_alive_secs, server_def, remote_eager_workers.get(), - ctx->context->Executor()->Async(), base_request)); + ctx->context->Executor().Async(), base_request)); tensorflow::RemoteRendezvous* r = grpc_server->worker_env()->rendezvous_mgr->Find(context_id); @@ -564,7 +572,7 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { const tensorflow::Tensor* t = nullptr; tensorflow::TensorHandle* h_cpu = nullptr; status->status = EagerCopyToDevice( - handle, handle->Context(), handle->Context()->Executor(), + handle, handle->Context(), &handle->Context()->Executor(), handle->Context()->HostCPU(), false, &h_cpu); if (!status->status.ok()) { return nullptr; @@ -596,33 +604,8 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { - const char* name = op_or_function_name; // Shorthand - const tensorflow::AttrTypeMap* types; - bool is_function = false; - status->status = tensorflow::AttrTypeMapForOp(name, &types, &is_function); - if (!status->status.ok()) { - return nullptr; - } - if (!is_function) { - const tensorflow::OpDef* op_def; - status->status = tensorflow::OpDefForOp(op_or_function_name, &op_def); - if (!status->status.ok()) { - return nullptr; - } - return new TFE_Op(ctx, name, false, types, - new TFE_OpInferenceContext(op_def)); - } - if (!ctx->context->FindFunctionByName(name)) { - status->status = tensorflow::errors::NotFound( - "'", name, - "' is neither a type of a primitive operation nor a name " - "of a function registered in binary running on ", - tensorflow::port::Hostname(), - ". 
Make sure the operation or function is " - "registered in the binary running in this process."); - return nullptr; - } - return new TFE_Op(ctx, name, true, types, nullptr); + return NewOrResetOp(ctx, op_or_function_name, status, + /* op_to_reset= */ nullptr); } void TFE_DeleteOp(TFE_Op* op) { delete op; } @@ -916,7 +899,7 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, return nullptr; } status->status = tensorflow::EagerCopyToDevice(h->handle, ctx->context, - ctx->context->Executor(), + &ctx->context->Executor(), device, false, &handle); if (status->status.ok()) { return new TFE_TensorHandle(handle); @@ -967,7 +950,7 @@ TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t, void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status) { - status->status = ctx->context->Executor()->WaitForAllPendingNodes(); + status->status = ctx->context->Executor().WaitForAllPendingNodes(); if (!status->status.ok()) return; tensorflow::mutex_lock ml(*ctx->context->MetadataMu()); status->status = MessageToBuffer(*ctx->context->RunMetadataProto(), buf); @@ -979,9 +962,9 @@ TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func, TF_Status* status) { TFE_Op* func_op = TFE_NewOp(ctx, func.name().data(), status); for (const auto& attr : func.attr()) { - if (TF_GetCode(status) != TF_OK) return nullptr; + if (!status->status.ok()) return nullptr; SetOpAttrValueScalar(ctx, func_op, attr.second, attr.first.data(), status); - if (TF_GetCode(status) != TF_OK) return nullptr; + if (!status->status.ok()) return nullptr; } return func_op; } @@ -1029,7 +1012,7 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, } break; case tensorflow::AttrValue::kFunc: { const auto func_op = GetFunc(ctx, default_value.func(), status); - if (TF_GetCode(status) != TF_OK) return; + if (!status->status.ok()) return; // TODO(nareshmodi): TFE_OpSetAttrFunction and TFE_OpSetAttrFunctionList // require TFE_Op* and just convert it internally a NameAttrValue, so // consider adding an overload to the C API to make this case easier. diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index a9ad77198e7..a40a435065f 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -28,6 +28,16 @@ limitations under the License. using tensorflow::string; +void TFE_OpReset(TFE_Context* ctx, const char* op_or_function_name, + TF_Status* status, TFE_Op* op_to_reset) { + if (op_to_reset) { + NewOrResetOp(ctx, op_or_function_name, status, op_to_reset); + } else { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + "op_to_reset should not be nullptr"); + } +} + void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { op->operation.ConsumeInput(h->handle); } @@ -597,5 +607,5 @@ void TFE_ContextSetExecutorForThread(TFE_Context* ctx, TFE_Executor* executor) { } TFE_Executor* TFE_ContextGetExecutorForThread(TFE_Context* ctx) { - return new TFE_Executor(ctx->context->Executor()); + return new TFE_Executor(&ctx->context->Executor()); } diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index e5a9459faff..cafef707706 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -22,6 +22,10 @@ limitations under the License. 
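// Editor's sketch (not part of the patch): the allocation-reuse pattern enabled by
// TFE_OpReset, which this header hunk declares just below. `ctx` and `status` are
// assumed to be a live TFE_Context* and TF_Status*.
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"

void RunMatMulTwice(TFE_Context* ctx, TF_Status* status) {
  TFE_Op* op = TFE_NewOp(ctx, "MatMul", status);   // first call allocates the op
  // ... add inputs, set attrs, TFE_Execute(op, ...) ...
  TFE_OpReset(ctx, "MatMul", status, op);          // later calls recycle the same TFE_Op
  // ... add inputs, set attrs, TFE_Execute(op, ...) ...
  TFE_DeleteOp(op);
}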
extern "C" { #endif +TF_CAPI_EXPORT extern void TFE_OpReset(TFE_Context* ctx, + const char* op_or_function_name, + TF_Status* status, TFE_Op* op_to_reset); + TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status); diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc index ab76ad10adc..95165b0c5dc 100644 --- a/tensorflow/c/eager/c_api_experimental_test.cc +++ b/tensorflow/c/eager/c_api_experimental_test.cc @@ -84,11 +84,6 @@ void ExecuteWithProfiling(bool async) { string profile_proto_str = profile_proto.DebugString(); if (!gpu_device_name.empty()) { EXPECT_TRUE(HasSubstr(profile_proto_str, "/device:GPU:0")); - // device name with "stream:all" is collected by Device Tracer. -#ifndef TENSORFLOW_USE_ROCM - // ROCm platform does not yet support stream level tracing - EXPECT_TRUE(HasSubstr(profile_proto_str, "stream:all")); -#endif } // "/host:CPU" is collected by TraceMe EXPECT_TRUE(HasSubstr(profile_proto_str, "/host:CPU")); diff --git a/tensorflow/c/eager/c_api_internal.cc b/tensorflow/c/eager/c_api_internal.cc new file mode 100644 index 00000000000..772fae13faf --- /dev/null +++ b/tensorflow/c/eager/c_api_internal.cc @@ -0,0 +1,58 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/c_api_internal.h" + +#include "tensorflow/core/platform/host_info.h" + +TFE_Op* NewOrResetOp(TFE_Context* ctx, const char* op_or_function_name, + TF_Status* status, TFE_Op* op_to_reset) { + const char* name = op_or_function_name; // Shorthand + const tensorflow::AttrTypeMap* types; + bool is_function = false; + status->status = tensorflow::AttrTypeMapForOp(name, &types, &is_function); + if (!status->status.ok()) { + return nullptr; + } + auto create_or_reset = [&op_to_reset, &ctx, &name, &types]( + bool is_function, + TFE_OpInferenceContext* inference_ctx) -> TFE_Op* { + if (op_to_reset) { + op_to_reset->Reset(ctx, name, is_function, types, inference_ctx); + return op_to_reset; + } else { + return new TFE_Op(ctx, name, is_function, types, inference_ctx); + } + }; + + if (!is_function) { + const tensorflow::OpDef* op_def; + status->status = tensorflow::OpDefForOp(op_or_function_name, &op_def); + if (!status->status.ok()) { + return nullptr; + } + return create_or_reset(false, new TFE_OpInferenceContext(op_def)); + } + if (!ctx->context->FindFunctionByName(name)) { + status->status = tensorflow::errors::NotFound( + "'", name, + "' is neither a type of a primitive operation nor a name " + "of a function registered in binary running on ", + tensorflow::port::Hostname(), + ". 
Make sure the operation or function is " + "registered in the binary running in this process."); + return nullptr; + } + return create_or_reset(true, nullptr); +} diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 5efed2ca76d..964e558a01f 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -133,10 +133,25 @@ struct TFE_Op { : operation(ctx->context, op, is_function, t), inference_ctx(inference_ctx) {} + void Clear() { + operation.Clear(); + inference_ctx.reset(); + } + + void Reset(TFE_Context* ctx, const char* op, bool is_function, + const tensorflow::AttrTypeMap* t, + TFE_OpInferenceContext* infer_ctx) { + operation.Reset(ctx->context, op, is_function, t, nullptr); + inference_ctx.reset(infer_ctx); + } + tensorflow::EagerOperation operation; std::unique_ptr inference_ctx; }; +TFE_Op* NewOrResetOp(TFE_Context* ctx, const char* op_or_function_name, + TF_Status* status, TFE_Op* op_to_reset = nullptr); + struct TFE_Profiler { explicit TFE_Profiler() { profiler = tensorflow::ProfilerSession::Create(); } diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index d3b755fee6e..6702e26e66d 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -1069,10 +1069,13 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) { // still fail. TF_SetStatus(status, TF_OK, ""); TFE_DeleteTensorHandle(retvals[0]); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + EXPECT_NE(TF_OK, TF_GetCode(status)); + TF_SetStatus(status, TF_OK, ""); retvals[0] = nullptr; TFE_Execute(matmul2, &retvals[0], &num_retvals, status); EXPECT_NE(TF_OK, TF_GetCode(status)); - TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); TFE_ExecutorClearError(executor); TFE_ExecutorWaitForAllPendingNodes(executor, status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc index 919e2dfc638..cc13dcf9976 100644 --- a/tensorflow/cc/framework/cc_op_gen.cc +++ b/tensorflow/cc/framework/cc_op_gen.cc @@ -292,7 +292,9 @@ string ToCamelCase(const string& str) { bool cap = true; while (i < str.size()) { const char c = str[i++]; - if (c == joiner) { + if (c == '>') { + cap = true; + } else if (c == joiner) { cap = true; } else if (cap) { result += toupper(c); @@ -304,6 +306,21 @@ string ToCamelCase(const string& str) { return result; } +string SeparateNamespaces(const string& str) { + string result; + const char joiner = '_'; + size_t i = 0; + while (i < str.size()) { + const char c = str[i++]; + if (c == '>') { + result += joiner; + } else { + result += c; + } + } + return result; +} + // Returns a pair. The string is the C++ type name to be used for // attr_type when defining an object of that type. 
The bool is a flag to // indicate whether to treat the type as const when accepting the C++ type as an @@ -549,7 +566,7 @@ struct OpInfo { OpInfo::OpInfo(const OpDef& graph_op_def, const ApiDef& api_def, const std::vector& aliases) : graph_op_def(graph_op_def), api_def(api_def), aliases(aliases) { - op_name = api_def.endpoint(0).name(); + op_name = SeparateNamespaces(api_def.endpoint(0).name()); InferOpAttributes(graph_op_def, &inferred_input_attrs); has_optional_attrs = HasOptionalAttrs(api_def, inferred_input_attrs); arg_types.push_back("const ::tensorflow::Scope&"); diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index b2affdd9993..c9786fa8b7e 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -127,8 +127,29 @@ cc_library( ) tf_cc_test( - name = "loader_test", - srcs = ["loader_test.cc"], + name = "saved_model_bundle_test", + srcs = ["saved_model_bundle_test.cc"], + data = [ + ":saved_model_half_plus_two", + ], + linkstatic = 1, + deps = [ + ":constants", + ":loader", + ":signature_constants", + ":tag_constants", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + +tf_cc_test( + name = "saved_model_bundle_lite_test", + srcs = ["saved_model_bundle_lite_test.cc"], data = [ ":saved_model_half_plus_two", ], diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index a3b80fbdba5..0aec4f42aee 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -299,6 +299,8 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, } // namespace +SavedModelBundleInterface::~SavedModelBundleInterface() {} + Status LoadSavedModel(const SessionOptions& session_options, const RunOptions& run_options, const string& export_dir, const std::unordered_set& tags, @@ -323,6 +325,133 @@ Status LoadSavedModel(const SessionOptions& session_options, return status; } +namespace { +// Session wrapper that prevents calls to Session::Create(), Session::Extend(), +// and the deprecated partial-run methods. +// +// Limiting the available methods on a returned Session gives us the option +// to replace the Session with a cut-down implementation, without breaking any +// users. 
+class LiteSessionWrapper : public Session { + public: + explicit LiteSessionWrapper(std::unique_ptr wrapped) + : wrapped_(std::move(wrapped)) {} + + Status Create(const GraphDef& graph) override { + return errors::Unimplemented("Session::Create()"); + } + Status Create(GraphDef&& graph) override { + return errors::Unimplemented("Session::Create()"); + } + + Status Extend(const GraphDef& graph) override { + return errors::Unimplemented("Session::Extend()"); + } + Status Extend(GraphDef&& graph) override { + return errors::Unimplemented("Session::Extend()"); + } + + Status Run(const std::vector>& inputs, + const std::vector& output_tensor_names, + const std::vector& target_node_names, + std::vector* outputs) override { + return wrapped_->Run(inputs, output_tensor_names, target_node_names, + outputs); + } + + Status Create(const RunOptions& run_options, const GraphDef& graph) override { + return errors::Unimplemented("Session::Create()"); + } + Status Extend(const RunOptions& run_options, const GraphDef& graph) override { + return errors::Unimplemented("Session::Extend()"); + } + Status Create(const RunOptions& run_options, GraphDef&& graph) override { + return errors::Unimplemented("Session::Create()"); + } + Status Extend(const RunOptions& run_options, GraphDef&& graph) override { + return errors::Unimplemented("Session::Extend()"); + } + Status Close(const RunOptions& run_options) override { + return wrapped_->Close(run_options); + } + + Status Run(const RunOptions& run_options, + const std::vector>& inputs, + const std::vector& output_tensor_names, + const std::vector& target_node_names, + std::vector* outputs, RunMetadata* run_metadata) override { + return wrapped_->Run(run_options, inputs, output_tensor_names, + target_node_names, outputs, run_metadata); + } + + Status PRunSetup(const std::vector& input_names, + const std::vector& output_names, + const std::vector& target_nodes, + string* handle) override { + return errors::Unimplemented("Session::PRunSetup()"); + } + + Status PRun(const string& handle, + const std::vector>& inputs, + const std::vector& output_names, + std::vector* outputs) override { + return errors::Unimplemented("Session::PRun()"); + } + + Status ListDevices(std::vector* response) override { + return wrapped_->ListDevices(response); + } + + Status Close() override { return wrapped_->Close(); } + + Status MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle) override { + return wrapped_->MakeCallable(callable_options, out_handle); + } + + Status RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata) override { + return wrapped_->RunCallable(handle, feed_tensors, fetch_tensors, + run_metadata); + } + + Status RunCallable( + CallableHandle handle, const std::vector& feed_tensors, + std::vector* fetch_tensors, RunMetadata* run_metadata, + const thread::ThreadPoolOptions& threadpool_options) override { + return wrapped_->RunCallable(handle, feed_tensors, fetch_tensors, + run_metadata, threadpool_options); + } + + Status ReleaseCallable(CallableHandle handle) override { + return wrapped_->ReleaseCallable(handle); + } + + private: + const std::unique_ptr wrapped_; +}; +} // namespace + +Status LoadSavedModel(const SessionOptions& session_options, + const RunOptions& run_options, const string& export_dir, + const std::unordered_set& tags, + SavedModelBundleLite* const bundle) { + SavedModelBundle legacy_bundle; + SessionOptions rewritten_options(session_options); + 
rewritten_options.config.mutable_experimental() + ->set_optimize_for_static_graph(true); + // TODO(mrry): Consider specializing the session creation to reduce peak + // RAM consumption by using `Session::Create(GraphDef&&)`. + TF_RETURN_IF_ERROR(LoadSavedModel(session_options, run_options, export_dir, + tags, &legacy_bundle)); + *bundle = SavedModelBundleLite( + absl::make_unique(std::move(legacy_bundle.session)), + std::move(*legacy_bundle.meta_graph_def.mutable_signature_def())); + return Status::OK(); +} + bool MaybeSavedModelDirectory(const string& export_dir) { const string saved_model_pb_path = io::JoinPath(export_dir, kSavedModelFilenamePb); diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h index a8e098fa544..74094a0cc23 100644 --- a/tensorflow/cc/saved_model/loader.h +++ b/tensorflow/cc/saved_model/loader.h @@ -27,31 +27,96 @@ limitations under the License. namespace tensorflow { -/// SavedModel representation once the SavedModel is loaded from storage. -struct SavedModelBundle { - std::unique_ptr session; - MetaGraphDef meta_graph_def; +/// Represents a SavedModel that is loaded from storage. +class SavedModelBundleInterface { + public: + virtual ~SavedModelBundleInterface(); + /// Returns the TensorFlow Session that can be used to interact with the + /// SavedModel. + virtual Session* GetSession() const = 0; + + /// Returns a map from signature name to SignatureDef for all signatures in + /// in the SavedModel. + virtual const protobuf::Map& GetSignatures() const = 0; +}; + +/// SavedModel representation once the SavedModel is loaded from storage. +/// +/// NOTE: Prefer to use SavedModelBundleLite in new code, as it consumes less +/// RAM. +struct SavedModelBundle : public SavedModelBundleInterface { /// A TensorFlow Session does not Close itself on destruction. To avoid /// resource leaks, we explicitly call Close on Sessions that we create. - ~SavedModelBundle() { + ~SavedModelBundle() override { if (session) { session->Close().IgnoreError(); } } SavedModelBundle() = default; + + Session* GetSession() const override { return session.get(); } + const protobuf::Map& GetSignatures() const override { + return meta_graph_def.signature_def(); + } + + std::unique_ptr session; + MetaGraphDef meta_graph_def; }; -/// Loads a SavedModel from the specified export directory. The meta graph def +// A version of SavedModelBundle that avoids storing a potentially large +// MetaGraphDef. Prefer to use SavedModelBundleLite in new code. +class SavedModelBundleLite : public SavedModelBundleInterface { + public: + SavedModelBundleLite() = default; + SavedModelBundleLite& operator=(SavedModelBundleLite&& other) = default; + + SavedModelBundleLite(std::unique_ptr session, + protobuf::Map signatures) + : session_(std::move(session)), signatures_(std::move(signatures)) {} + + /// A TensorFlow Session does not Close itself on destruction. To avoid + /// resource leaks, we explicitly call Close on Sessions that we create. + ~SavedModelBundleLite() override { + if (session_) { + session_->Close().IgnoreError(); + } + } + + Session* GetSession() const override { return session_.get(); } + const protobuf::Map& GetSignatures() const override { + return signatures_; + } + + private: + std::unique_ptr session_; + protobuf::Map signatures_; +}; + +/// Loads a SavedModel from the specified export directory. The MetaGraphDef /// to be loaded is identified by the supplied tags, corresponding exactly to -/// the set of tags used at SavedModel build time. 
Returns a SavedModel bundle -/// with a session and the requested meta graph def, if found. +/// the set of tags used at SavedModel build time. Stores a SavedModel bundle in +/// *bundle with a session and the requested MetaGraphDef, if found. +/// +/// NOTE: Prefer the overload that takes a SavedModelBundleLite* in new code. Status LoadSavedModel(const SessionOptions& session_options, const RunOptions& run_options, const string& export_dir, const std::unordered_set& tags, SavedModelBundle* const bundle); +/// Loads a SavedModel from the specified export directory. The MetaGraphDef +/// to be loaded is identified by the supplied tags, corresponding exactly to +/// the set of tags used at SavedModel build time. Stores a SavedModel bundle +/// in *bundle with a session created from the requested MetaGraphDef if found. +/// +/// This overload creates a SavedModelBundleLite, which consumes less RAM than +/// an equivalent SavedModelBundle. +Status LoadSavedModel(const SessionOptions& session_options, + const RunOptions& run_options, const string& export_dir, + const std::unordered_set& tags, + SavedModelBundleLite* const bundle); + /// Checks whether the provided directory could contain a SavedModel. Note that /// the method does not load any data by itself. If the method returns `false`, /// the export directory definitely does not contain a SavedModel. If the method diff --git a/tensorflow/cc/saved_model/saved_model_bundle_lite_test.cc b/tensorflow/cc/saved_model/saved_model_bundle_lite_test.cc new file mode 100644 index 00000000000..7ef0b828425 --- /dev/null +++ b/tensorflow/cc/saved_model/saved_model_bundle_lite_test.cc @@ -0,0 +1,244 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/saved_model/loader.h" + +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/signature_constants.h" +#include "tensorflow/cc/saved_model/tag_constants.h" +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/example/feature.pb.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +constexpr char kTestDataPbTxt[] = + "cc/saved_model/testdata/half_plus_two_pbtxt/00000123"; +constexpr char kTestDataMainOp[] = + "cc/saved_model/testdata/half_plus_two_main_op/00000123"; +constexpr char kTestDataSharded[] = + "cc/saved_model/testdata/half_plus_two/00000123"; +constexpr char kTestDataInitOpV2[] = + "cc/saved_model/testdata/half_plus_two_v2/00000123"; + +class LoaderTest : public ::testing::Test { + protected: + LoaderTest() {} + + string MakeSerializedExample(float x) { + tensorflow::Example example; + auto* feature_map = example.mutable_features()->mutable_feature(); + (*feature_map)["x"].mutable_float_list()->add_value(x); + return example.SerializeAsString(); + } + + void ValidateAssets(const string& export_dir, + const SavedModelBundleLite& bundle) { + const string asset_directory = + io::JoinPath(export_dir, kSavedModelAssetsDirectory); + const string asset_filename = "foo.txt"; + const string asset_filepath = io::JoinPath(asset_directory, asset_filename); + TF_EXPECT_OK(Env::Default()->FileExists(asset_filepath)); + + std::vector path_outputs; + TF_ASSERT_OK( + bundle.GetSession()->Run({}, {"filename_tensor:0"}, {}, &path_outputs)); + ASSERT_EQ(1, path_outputs.size()); + + test::ExpectTensorEqual( + test::AsTensor({"foo.txt"}, TensorShape({})), path_outputs[0]); + } + + void CheckSavedModelBundle(const string& export_dir, + const SavedModelBundleLite& bundle) { + ValidateAssets(export_dir, bundle); + // Retrieve the regression signature from the bundle. + const auto& signature_def = bundle.GetSignatures().at("regress_x_to_y"); + + const string input_name = signature_def.inputs().at(kRegressInputs).name(); + const string output_name = + signature_def.outputs().at(kRegressOutputs).name(); + + std::vector serialized_examples; + for (float x : {0, 1, 2, 3}) { + serialized_examples.push_back(MakeSerializedExample(x)); + } + + // Validate the half plus two behavior. + Tensor input = + test::AsTensor(serialized_examples, TensorShape({4})); + std::vector outputs; + TF_ASSERT_OK(bundle.GetSession()->Run({{input_name, input}}, {output_name}, + {}, &outputs)); + ASSERT_EQ(outputs.size(), 1); + test::ExpectTensorEqual( + outputs[0], + test::AsTensor({2, 2.5, 3, 3.5}, TensorShape({4, 1}))); + } +}; + +// Test for resource leaks related to TensorFlow session closing requirements +// when loading and unloading large numbers of SavedModelBundles. +// TODO(sukritiramesh): Increase run iterations and move outside of the test +// suite. 
+TEST_F(LoaderTest, ResourceLeakTest) { + SavedModelBundleLite bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + for (int i = 0; i < 100; ++i) { + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + CheckSavedModelBundle(export_dir, bundle); + } +} + +TEST_F(LoaderTest, TagMatch) { + SavedModelBundleLite bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + CheckSavedModelBundle(export_dir, bundle); +} + +TEST_F(LoaderTest, NoTagMatch) { + SavedModelBundleLite bundle; + RunOptions run_options; + SessionOptions session_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {"missing-tag"}, &bundle); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(absl::StrContains( + st.error_message(), + "Could not find meta graph def matching supplied tags: { missing-tag }")) + << st.error_message(); +} + +TEST_F(LoaderTest, NoTagMatchMultiple) { + SavedModelBundleLite bundle; + RunOptions run_options; + SessionOptions session_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe, "missing-tag"}, &bundle); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(absl::StrContains( + st.error_message(), + "Could not find meta graph def matching supplied tags: ")) + << st.error_message(); +} + +TEST_F(LoaderTest, SessionCreationFailure) { + SavedModelBundleLite bundle; + // Use invalid SessionOptions to cause session creation to fail. Default + // options work, so provide an invalid value for the target field. 
+ SessionOptions session_options; + constexpr char kInvalidTarget[] = "invalid target"; + session_options.target = kInvalidTarget; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(absl::StrContains(st.error_message(), kInvalidTarget)) + << st.error_message(); +} + +TEST_F(LoaderTest, PbtxtFormat) { + SavedModelBundleLite bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPbTxt); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + CheckSavedModelBundle(export_dir, bundle); +} + +TEST_F(LoaderTest, MainOpFormat) { + SavedModelBundleLite bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataMainOp); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + CheckSavedModelBundle(export_dir, bundle); +} + +TEST_F(LoaderTest, InvalidExportPath) { + SavedModelBundleLite bundle; + RunOptions run_options; + SessionOptions session_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), "missing-path"); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle); + EXPECT_FALSE(st.ok()); +} + +TEST_F(LoaderTest, MaybeSavedModelDirectory) { + // Valid SavedModel directory. + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + EXPECT_TRUE(MaybeSavedModelDirectory(export_dir)); + + // Directory that does not exist. + const string missing_export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), "missing-path"); + EXPECT_FALSE(MaybeSavedModelDirectory(missing_export_dir)); + + // Directory that exists but is an invalid SavedModel location. + const string invalid_export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), "cc/saved_model"); + EXPECT_FALSE(MaybeSavedModelDirectory(invalid_export_dir)); +} + +TEST_F(LoaderTest, SavedModelInitOpV2Format) { + SavedModelBundleLite bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataInitOpV2); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + CheckSavedModelBundle(export_dir, bundle); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/saved_model_bundle_test.cc similarity index 98% rename from tensorflow/cc/saved_model/loader_test.cc rename to tensorflow/cc/saved_model/saved_model_bundle_test.cc index aa2031d17d2..a2ce9b5f5e9 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/saved_model_bundle_test.cc @@ -71,8 +71,7 @@ class LoaderTest : public ::testing::Test { const SavedModelBundle& bundle) { ValidateAssets(export_dir, bundle); // Retrieve the regression signature from meta graph def. 
- const auto signature_def_map = bundle.meta_graph_def.signature_def(); - const auto signature_def = signature_def_map.at("regress_x_to_y"); + const auto& signature_def = bundle.GetSignatures().at("regress_x_to_y"); const string input_name = signature_def.inputs().at(kRegressInputs).name(); const string output_name = diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 1ebfe235b4d..2b15b12ec24 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1,5 +1,5 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("//tensorflow/stream_executor:build_defs.bzl", "if_cuda_or_rocm") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library", "tf_jit_compilation_passes_extra_deps") load("//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library") @@ -38,7 +38,7 @@ cc_library( ":xla_cpu_device", ":xla_cpu_jit", "//tensorflow/compiler/plugin", - ] + if_cuda([ + ] + if_cuda_or_rocm([ ":xla_gpu_device", ":xla_gpu_jit", ]), @@ -61,7 +61,7 @@ cc_library( cc_library( name = "xla_gpu_jit", visibility = ["//visibility:public"], - deps = if_cuda([ + deps = if_cuda_or_rocm([ ":jit_compilation_passes", "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 6498436fbd9..b0c78469118 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -130,17 +130,24 @@ RecursiveCompilabilityChecker::FindUncompilableNodes( return uncompilable_nodes; } -bool RecursiveCompilabilityChecker::HasXLAKernel(const Node& node) const { +bool RecursiveCompilabilityChecker::HasXLAKernel( + const Node& node, string* uncompilable_reason) const { // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient // is really a kind of function call and will be handled by // IsCompilableCall(). - if (node.type_string() == "SymbolicGradient") return false; + if (node.type_string() == "SymbolicGradient") { + *uncompilable_reason = + "SymbolicGradient should be handled by IsCompilableCall()."; + return false; + } if (node.type_string() == "Const") { // Skip Const op with type DT_STRING, since XLA doesn't support it, but the // registered Const KernelDef says that it does, to support no-op Assert for // tfcompile. const AttrValue* attr = node.attrs().Find("dtype"); if (attr != nullptr && attr->type() == DT_STRING) { + *uncompilable_reason = + "Const op with type DT_STRING is not supported by XLA."; return false; } } @@ -150,10 +157,16 @@ bool RecursiveCompilabilityChecker::HasXLAKernel(const Node& node) const { // such nodes out of XLA clusters. if (HasForwardedRefInput(node)) { VLOG(2) << "Rejecting " << node.name() << ": Identity with unsafe cast."; + *uncompilable_reason = "Identity with unsafe cast."; return false; } - return FindKernelDef(jit_device_type_, node.def(), nullptr, nullptr).ok(); + Status s = FindKernelDef(jit_device_type_, node.def(), nullptr, nullptr); + if (!s.ok()) { + *uncompilable_reason = s.error_message(); + return false; + } + return true; } // Tests whether 'if_node' is compilable. 
Every operator in the then_branch and @@ -336,16 +349,17 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( return false; } + string uncompilable_reason; if (IsFunctionCall(*lib_runtime->GetFunctionLibraryDefinition(), node)) { if (!IsCompilableCall(node.def(), lib_runtime, stack_trace, encapsulating_function, uncompilable_nodes)) { LogNotCompilable(node, "unsupported function"); return false; } - } else if (!HasXLAKernel(node)) { - absl::string_view uncompilable_reason = "unsupported op"; - MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace, - encapsulating_function, uncompilable_nodes); + } else if (!HasXLAKernel(node, &uncompilable_reason)) { + MaybeMarkUncompilableNode( + absl::StrCat("unsupported op: ", uncompilable_reason), *stack_trace, + encapsulating_function, uncompilable_nodes); LogNotCompilable(node, uncompilable_reason); return false; } diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 04639df14a1..43b2689b522 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -247,7 +247,8 @@ class RecursiveCompilabilityChecker { absl::c_any_of(node.output_types(), is_variant); } - bool HasXLAKernel(const Node& node) const; + bool HasXLAKernel(const Node& node, + string* uncompilable_reason = nullptr) const; static void MaybeMarkUncompilableNode( const absl::string_view reason, diff --git a/tensorflow/compiler/jit/compilability_check_util_test.cc b/tensorflow/compiler/jit/compilability_check_util_test.cc index 0dd3b8141c9..cdce1e92799 100644 --- a/tensorflow/compiler/jit/compilability_check_util_test.cc +++ b/tensorflow/compiler/jit/compilability_check_util_test.cc @@ -125,7 +125,8 @@ TEST_F(CompilabilityCheckUtilTest, CheckNonFunctionalNodes) { const auto& uncompilable_nodes_inside_function = node_info_it->second.second; ASSERT_EQ(1, uncompilable_nodes_inside_function.size()); const auto& uncompilable_node_info = uncompilable_nodes_inside_function.at(0); - EXPECT_EQ("unsupported op", uncompilable_node_info.uncompilable_reason); + EXPECT_TRUE(absl::StrContains(uncompilable_node_info.uncompilable_reason, + "unsupported op")); ASSERT_EQ(1, uncompilable_node_info.stack_trace.size()); ASSERT_EQ("", uncompilable_node_info.stack_trace.at(0).function_name); } @@ -167,7 +168,8 @@ TEST_F(CompilabilityCheckUtilTest, CheckSimpleFunctionNode) { EXPECT_EQ("D", node_stack.at(0).name); EXPECT_EQ(kUncompilableFunctionNodeName, node_stack.at(1).name); EXPECT_EQ(kUncompilableFunctionNodeName, node_info.name); - EXPECT_EQ("unsupported op", node_info.uncompilable_reason); + EXPECT_TRUE( + absl::StrContains(node_info.uncompilable_reason, "unsupported op")); } TEST_F(CompilabilityCheckUtilTest, CheckFunctionalWhileNode) { @@ -246,7 +248,8 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalWhileNode) { stacktrace_second_node_info.function_name); EXPECT_EQ(kUncompilableFunctionNodeName, node_info.name); - EXPECT_EQ("unsupported op", node_info.uncompilable_reason); + EXPECT_TRUE( + absl::StrContains(node_info.uncompilable_reason, "unsupported op")); } TEST_F(CompilabilityCheckUtilTest, CheckFunctionalIfNode) { @@ -322,7 +325,8 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalIfNode) { stacktrace_second_node_info.function_name); EXPECT_EQ(kUncompilableFunctionNodeName, uncompilable_node_one.name); - EXPECT_EQ("unsupported op", uncompilable_node_one.uncompilable_reason); + EXPECT_TRUE(absl::StrContains(uncompilable_node_one.uncompilable_reason, + 
"unsupported op")); NameAttrList function_two; function_two.set_name(kUncompilableFunctionTwoName); @@ -345,7 +349,8 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalIfNode) { node_two_stacktrace_second_node.function_name); EXPECT_EQ(kUncompilableFunctionNodeTwoName, uncompilable_node_two.name); - EXPECT_EQ("unsupported op", uncompilable_node_two.uncompilable_reason); + EXPECT_TRUE(absl::StrContains(uncompilable_node_one.uncompilable_reason, + "unsupported op")); } } // namespace diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index dbe0d66b0c8..4e0f62b4351 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -45,7 +45,8 @@ cc_library( "//tensorflow/compiler/mlir/lite:tensorflow_lite_quantize", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_fold_switch", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_test_passes", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/xla:hlo", "//tensorflow/compiler/mlir/xla:lhlo", diff --git a/tensorflow/compiler/mlir/g3doc/README.md b/tensorflow/compiler/mlir/g3doc/README.md new file mode 100644 index 00000000000..39734828d19 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/README.md @@ -0,0 +1,3 @@ +# TensorFlow MLIR + +These are the docs for: https://www.tensorflow.org/mlir diff --git a/tensorflow/compiler/mlir/g3doc/_book.yaml b/tensorflow/compiler/mlir/g3doc/_book.yaml new file mode 100644 index 00000000000..9e8aa655c09 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/_book.yaml @@ -0,0 +1,24 @@ +upper_tabs: +# Tabs left of dropdown menu +- include: /_upper_tabs_left.yaml +- include: /api_docs/_upper_tabs_api.yaml +# Dropdown menu +- name: Resources + path: /resources + is_default: true + menu: + - include: /resources/_menu_toc.yaml + lower_tabs: + # Subsite tabs + other: + - name: Guide & Tutorials + contents: + - title: Overview + path: /mlir/overview + - heading: Dialects + - title: TensorFlow + path: /mlir/tf_ops + - title: TensorFlow Lite + path: /mlir/tfl_ops + +- include: /_upper_tabs_right.yaml diff --git a/tensorflow/compiler/mlir/g3doc/_index.yaml b/tensorflow/compiler/mlir/g3doc/_index.yaml new file mode 100644 index 00000000000..9090eefe875 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/_index.yaml @@ -0,0 +1,48 @@ +book_path: /mlir/_book.yaml +project_path: /mlir/_project.yaml +description: +landing_page: + custom_css_path: /site-assets/css/style.css + rows: + - heading: MLIR unifies the infrastructure for high-performance ML models in TensorFlow. + items: + - description: > + The MLIR project defines a common intermediate representation (IR) that + unifies the infrastructure required to execute high performance machine + learning models in TensorFlow and similar ML frameworks. This project + will include the application of HPC techniques, along with integration of + search algorithms like reinforcement learning. MLIR aims to reduce the + cost to bring up new hardware, and improve usability for existing + TensorFlow users. + + - code_block: | +
+        // Syntactically similar to LLVM:
+        func @testFunction(%arg0: i32) -> i32 {
+          %x = call @thingToCall(%arg0) : (i32) -> i32
+          br ^bb1
+        ^bb1:
+          %y = addi %x, %x : i32
+          return %y : i32
+        }
+        
+ + - classname: devsite-landing-row-cards + items: + - heading: "Multi-Level Intermediate Representation for Compiler Infrastructure" + youtube_id: qzljG6DKgic + buttons: + - label: Watch the video + path: https://www.youtube.com/watch?v=qzljG6DKgic + - heading: "A new intermediate representation and compiler framework" + image_path: /resources/images/tf-logo-card-16x9.png + path: https://medium.com/tensorflow/mlir-a-new-intermediate-representation-and-compiler-framework-beba999ed18d + buttons: + - label: Read on TensorFlow blog + path: https://medium.com/tensorflow/mlir-a-new-intermediate-representation-and-compiler-framework-beba999ed18d + - heading: TensorFlow MLIR on GitHub + image_path: /resources/images/github-card-16x9.png + path: https://github.com/tensorflow/mlir + buttons: + - label: View on GitHub + path: https://github.com/tensorflow/mlir diff --git a/tensorflow/compiler/mlir/g3doc/_project.yaml b/tensorflow/compiler/mlir/g3doc/_project.yaml new file mode 100644 index 00000000000..be0e46ac0ca --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/_project.yaml @@ -0,0 +1,11 @@ +name: TensorFlow MLIR +breadcrumb_name: MLIR +home_url: /mlir/ +parent_project_metadata_path: /_project.yaml +description: > + MLIR unifies the infrastructure for high-performance ML models in TensorFlow. +use_site_branding: true +hide_from_products_list: true +content_license: cc-apache +buganizer_id: 443907 +include: /_project_included.yaml diff --git a/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg b/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg new file mode 100644 index 00000000000..aec0986ba02 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tensorflow/compiler/mlir/g3doc/overview.md b/tensorflow/compiler/mlir/g3doc/overview.md new file mode 100644 index 00000000000..885c04b9588 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/overview.md @@ -0,0 +1,5 @@ +# MLIR overview + +## Overview + +MLIR overview diagram diff --git a/tensorflow/compiler/mlir/g3doc/tf_ops.md b/tensorflow/compiler/mlir/g3doc/tf_ops.md new file mode 100644 index 00000000000..cedeba5dae1 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/tf_ops.md @@ -0,0 +1,2761 @@ + +# Operation definition +## tf.Abs (TF::AbsOp) +Computes the absolute value of a tensor. + +### Description: + +Given a tensor `x`, this operation returns a tensor containing the absolute +value of each element in `x`. For example, if x is an input element and y is +an output element, this operation computes \\(y = |x|\\). + +### Operands: +1. `x`: tensor of floating-point or 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point or 32/64-bit integer values + +## tf.AddN (TF::AddNOp) +Add all input tensors element wise. + +### Description: + + +### Operands: +1. `inputs`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow variant type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `N` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 1 attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. 
`sum`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow variant type values + +## tf.Add (TF::AddOp) +Returns x + y element-wise. + +### Description: + +*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of number or TensorFlow string type values +1. `y`: tensor of number or TensorFlow string type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of number or TensorFlow string type values + +## tf.AddV2 (TF::AddV2Op) +Returns x + y element-wise. + +### Description: + +*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of number values +1. `y`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of number values + +## tf.AvgPool (TF::AvgPoolOp) +Performs average pooling on the input. + +### Description: + +Each entry in `output` is the mean of the corresponding size `ksize` +window in `value`. + +### Operands: +1. `value`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `ksize` | `ArrayAttr` | 64-bit integer array attribute with at least 4 elements attribute | +| `strides` | `ArrayAttr` | 64-bit integer array attribute with at least 4 elements attribute | +| `padding` | `StringAttr` | string attribute whose value is SAME, or VALID attribute | +| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of floating-point values + +## tf.BatchToSpaceND (TF::BatchToSpaceNDOp) +BatchToSpace for N-D tensors of type T. + +### Description: + +This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape +`block_shape + [batch]`, interleaves these blocks back into the grid defined by +the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as +the input. The spatial dimensions of this intermediate result are then +optionally cropped according to `crops` to produce the output. This is the +reverse of SpaceToBatch. See below for a precise description. + +### Operands: +1. `input`: tensor of tf.dtype values +1. `block_shape`: tensor of 32/64-bit integer values +1. `crops`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tcrops` | `Attribute` | derived attribute attribute | +| `Tblock_shape` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.BiasAdd (TF::BiasAddOp) +Adds `bias` to `value`. + +### Description: + +This is a special case of `tf.add` where `bias` is restricted to be 1-D. +Broadcasting is supported, so `value` may have any number of dimensions. + +### Operands: +1. `value`: tensor of number values +1. 
`bias`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of number values + +## tf.Bitcast (TF::BitcastOp) + +Bitcasts a tensor from one type to another without copying data. + + +### Description: + +Given a tensor `input`, this operation returns a tensor that has the same buffer +data as `input` with datatype `type`. + +If the input datatype `T` is larger than the output datatype `type` then the +shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)]. + +If `T` is smaller than `type`, the operator requires that the rightmost +dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from +[..., sizeof(`type`)/sizeof(`T`)] to [...]. + +tf.bitcast() and tf.cast() work differently when real dtype is casted as a complex dtype +(e.g. tf.complex64 or tf.complex128) as tf.cast() make imaginary part 0 while tf.bitcast() +gives module error. +For example, + +Example 1: +```python +>>> a = [1., 2., 3.] +>>> equality_bitcast = tf.bitcast(a,tf.complex128) +tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot bitcast from float to complex128: shape [3] [Op:Bitcast] +>>> equality_cast = tf.cast(a,tf.complex128) +>>> print(equality_cast) +tf.Tensor([1.+0.j 2.+0.j 3.+0.j], shape=(3,), dtype=complex128) +``` +Example 2: +```python +>>> tf.bitcast(tf.constant(0xffffffff, dtype=tf.uint32), tf.uint8) + +``` +Example 3: +```python +>>> x = [1., 2., 3.] +>>> y = [0., 2., 3.] +>>> equality= tf.equal(x,y) +>>> equality_cast = tf.cast(equality,tf.float32) +>>> equality_bitcast = tf.bitcast(equality_cast,tf.uint8) +>>> print(equality) +tf.Tensor([False True True], shape=(3,), dtype=bool) +>>> print(equality_cast) +tf.Tensor([0. 1. 1.], shape=(3,), dtype=float32) +>>> print(equality_bitcast) +tf.Tensor( +[[ 0 0 0 0] + [ 0 0 128 63] + [ 0 0 128 63]], shape=(3, 4), dtype=uint8) +``` + +*NOTE*: Bitcast is implemented as a low-level cast, so machines with different +endian orderings will give different results. + +### Operands: +1. `input`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `type` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of number values + +## tf.BroadcastTo (TF::BroadcastToOp) +Broadcast an array for a compatible shape. + +### Description: + +Broadcasting is the process of making arrays to have compatible shapes +for arithmetic operations. Two shapes are compatible if for each +dimension pair they are either equal or one of them is one. When trying +to broadcast a Tensor to a shape, it starts with the trailing dimensions, +and works its way forward. + +For example, + +```python +>>> x = tf.constant([1, 2, 3]) +>>> y = tf.broadcast_to(x, [3, 3]) +>>> sess.run(y) +array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]], dtype=int32) +``` + +In the above example, the input Tensor with the shape of `[1, 3]` +is broadcasted to output Tensor with shape of `[3, 3]`. + +### Operands: +1. `input`: tensor of tf.dtype values +1. 
`shape`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tidx` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Cast (TF::CastOp) +Cast x of type SrcT to y of DstT. + +### Description: + + +### Operands: +1. `x`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `Truncate` | `BoolAttr` | bool attribute attribute | +| `SrcT` | `Attribute` | derived attribute attribute | +| `DstT` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of tf.dtype values + +## tf.Ceil (TF::CeilOp) +Returns element-wise smallest integer not less than x. + +### Description: + + +### Operands: +1. `x`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point values + +## tf.Concat (TF::ConcatOp) +Concatenates tensors along one dimension. + +### Description: + + +### Operands: +1. `concat_dim`: tensor of 32-bit integer values +1. `values`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `N` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 2 attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.ConcatV2 (TF::ConcatV2Op) +Concatenates tensors along one dimension. + +### Description: + + +### Operands: +1. `values`: tensor of tf.dtype values +1. `axis`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `N` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 2 attribute | +| `T` | `Attribute` | derived attribute attribute | +| `Tidx` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Conj (TF::ConjOp) +Returns the complex conjugate of a complex number. + +### Description: + +Given a tensor `input` of complex numbers, this operation returns a tensor of +complex numbers that are the complex conjugate of each element in `input`. The +complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the +real part and *b* is the imaginary part. + +The complex conjugate returned by this operation is of the form \\(a - bj\\). + +For example: + +``` +# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] +tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] +``` + +### Operands: +1. `input`: tensor of complex128 type or complex64 type or TensorFlow variant type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of complex128 type or complex64 type or TensorFlow variant type values + +## tf.Const (TF::ConstOp) +Constant tensor op + +### Description: + + +### Operands: + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `value` | `ElementsAttr` | constant vector/tensor attribute attribute | +| `dtype` | `Attribute` | derived attribute attribute | + +### Results: +1. 
`output`: tensor of tf.dtype values + +## tf.Conv2D (TF::Conv2DOp) + +Computes a 2-D convolution given 4-D `input` and `filter` tensors. + + +### Description: + +Given an input tensor of shape `[batch, in_height, in_width, in_channels]` +and a filter / kernel tensor of shape +`[filter_height, filter_width, in_channels, out_channels]`, this op +performs the following: + +1. Flattens the filter to a 2-D matrix with shape + `[filter_height * filter_width * in_channels, output_channels]`. +2. Extracts image patches from the input tensor to form a *virtual* + tensor of shape `[batch, out_height, out_width, + filter_height * filter_width * in_channels]`. +3. For each patch, right-multiplies the filter matrix and the image patch + vector. + +In detail, with the default NHWC format, + + output[b, i, j, k] = + sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] * + filter[di, dj, q, k] + +Must have `strides[0] = strides[3] = 1`. For the most common case of the same +horizontal and vertical strides, `strides = [1, stride, stride, 1]`. + +### Operands: +1. `input`: tensor of floating-point values +1. `filter`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `strides` | `ArrayAttr` | 64-bit integer array attribute attribute | +| `use_cudnn_on_gpu` | `BoolAttr` | bool attribute attribute | +| `padding` | `StringAttr` | string attribute whose value is SAME, or VALID, or EXPLICIT attribute | +| `explicit_paddings` | `ArrayAttr` | 64-bit integer array attribute attribute | +| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | +| `dilations` | `ArrayAttr` | 64-bit integer array attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of floating-point values + +## tf.Cos (TF::CosOp) +Computes cos of x element-wise. + +### Description: + + +### Operands: +1. `x`: tensor of floating-point or 64/128-bit complex type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point or 64/128-bit complex type values + +## tf.DepthwiseConv2dNative (TF::DepthwiseConv2dNativeOp) + +Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors. + + +### Description: + +Given an input tensor of shape `[batch, in_height, in_width, in_channels]` +and a filter / kernel tensor of shape +`[filter_height, filter_width, in_channels, channel_multiplier]`, containing +`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies +a different filter to each input channel (expanding from 1 channel to +`channel_multiplier` channels for each), then concatenates the results +together. Thus, the output has `in_channels * channel_multiplier` channels. + +``` +for k in 0..in_channels-1 + for q in 0..channel_multiplier-1 + output[b, i, j, k * channel_multiplier + q] = + sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] * + filter[di, dj, k, q] +``` + +Must have `strides[0] = strides[3] = 1`. For the most common case of the same +horizontal and vertical strides, `strides = [1, stride, stride, 1]`. + +### Operands: +1. `input`: tensor of floating-point values +1. 
`filter`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `strides` | `ArrayAttr` | 64-bit integer array attribute attribute | +| `padding` | `StringAttr` | string attribute whose value is SAME, or VALID attribute | +| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | +| `dilations` | `ArrayAttr` | 64-bit integer array attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of floating-point values + +## tf.Div (TF::DivOp) +Returns x / y element-wise. + +### Description: + +*NOTE*: `Div` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of number values +1. `y`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of number values + +## tf.Elu (TF::EluOp) + +Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise. + + +### Description: + +See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) +](http://arxiv.org/abs/1511.07289) + +### Operands: +1. `features`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `activations`: tensor of floating-point values + +## tf.Equal (TF::EqualOp) +Returns the truth value of (x == y) element-wise. + +### Description: + +*NOTE*: `Equal` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +```python +x = tf.constant([2, 4]) +y = tf.constant(2) +tf.math.equal(x, y) ==> array([True, False]) + +x = tf.constant([2, 4]) +y = tf.constant([2, 4]) +tf.math.equal(x, y) ==> array([True, True]) +``` + +### Operands: +1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values +1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of 1-bit integer values + +## tf.ExpandDims (TF::ExpandDimsOp) +Inserts a dimension of 1 into a tensor's shape. + +### Description: + +Given a tensor `input`, this operation inserts a dimension of 1 at the +dimension index `axis` of `input`'s shape. The dimension index `axis` starts at +zero; if you specify a negative number for `axis` it is counted backward from +the end. + +This operation is useful if you want to add a batch dimension to a single +element. For example, if you have a single image of shape `[height, width, +channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`, +which will make the shape `[1, height, width, channels]`. 
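+
+As a short illustrative sketch (added here for clarity; it is not part of the
+generated op documentation), the batch-dimension case looks like:
+
+```python
+import tensorflow as tf
+
+image = tf.zeros([28, 28, 3])     # a single image: [height, width, channels]
+batch = tf.expand_dims(image, 0)  # shape becomes [1, 28, 28, 3]
+```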
+ +Other examples: + +``` +# 't' is a tensor of shape [2] +shape(expand_dims(t, 0)) ==> [1, 2] +shape(expand_dims(t, 1)) ==> [2, 1] +shape(expand_dims(t, -1)) ==> [2, 1] + +# 't2' is a tensor of shape [2, 3, 5] +shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5] +shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5] +shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1] +``` + +This operation requires that: + +`-1-input.dims() <= dim <= input.dims()` + +This operation is related to `squeeze()`, which removes dimensions of +size 1. + +### Operands: +1. `input`: tensor of tf.dtype values +1. `dim`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tdim` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.FakeQuantWithMinMaxArgs (TF::FakeQuantWithMinMaxArgsOp) + +Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type. + + +### Description: + +Attributes `[min; max]` define the clamping range for the `inputs` data. +`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` +when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and +then de-quantized and output as floats in `[min; max]` interval. +`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. + +Before quantization, `min` and `max` values are adjusted with the following +logic. +It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, +the behavior can be unexpected: +If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. +If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. +If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, +`min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. + +Quantization is called fake since the output is still in floating point. + +### Operands: +1. `inputs`: tensor of 32-bit float values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `min` | `FloatAttr` | 32-bit float attribute attribute | +| `max` | `FloatAttr` | 32-bit float attribute attribute | +| `num_bits` | `IntegerAttr` | 64-bit integer attribute attribute | +| `narrow_range` | `BoolAttr` | bool attribute attribute | + +### Results: +1. `outputs`: tensor of 32-bit float values + +## tf.FakeQuantWithMinMaxVars (TF::FakeQuantWithMinMaxVarsOp) + +Fake-quantize the 'inputs' tensor of type float via global float scalars `min` + + +### Description: + +and `max` to 'outputs' tensor of same shape as `inputs`. + +`[min; max]` define the clamping range for the `inputs` data. +`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` +when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and +then de-quantized and output as floats in `[min; max]` interval. +`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. + +Before quantization, `min` and `max` values are adjusted with the following +logic. +It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, +the behavior can be unexpected: +If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. +If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. +If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, +`min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. 
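+
+A minimal plain-Python sketch of the adjustment rules above (the helper name
+`adjust_range` is illustrative only and is not a TensorFlow API):
+
+```python
+def adjust_range(min_val, max_val, num_bits):
+    # Reproduces the three documented cases for nudging [min, max].
+    if 0 < min_val:                    # 0 < min < max
+        return 0.0, max_val - min_val
+    if max_val < 0:                    # min < max < 0
+        return min_val - max_val, 0.0
+    scale = (max_val - min_val) / (2 ** num_bits - 1)  # min <= 0 <= max
+    min_adj = scale * round(min_val / scale)
+    return min_adj, max_val + min_adj - min_val
+```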
+ +This operation has a gradient and thus allows for training `min` and `max` +values. + +### Operands: +1. `inputs`: tensor of 32-bit float values +1. `min`: tensor of 32-bit float values +1. `max`: tensor of 32-bit float values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `num_bits` | `IntegerAttr` | 64-bit integer attribute attribute | +| `narrow_range` | `BoolAttr` | bool attribute attribute | + +### Results: +1. `outputs`: tensor of 32-bit float values + +## tf.Fill (TF::FillOp) +Creates a tensor filled with a scalar value. + +### Description: + +This operation creates a tensor of shape `dims` and fills it with `value`. + +For example: + +``` +# Output tensor has shape [2, 3]. +fill([2, 3], 9) ==> [[9, 9, 9] + [9, 9, 9]] +``` + +`tf.fill` differs from `tf.constant` in a few ways: + +* `tf.fill` only supports scalar contents, whereas `tf.constant` supports + Tensor values. +* `tf.fill` creates an Op in the computation graph that constructs the actual + Tensor value at runtime. This is in contrast to `tf.constant` which embeds + the entire Tensor into the graph with a `Const` node. +* Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes + based on other runtime Tensors, unlike `tf.constant`. + +### Operands: +1. `dims`: tensor of 32/64-bit integer values +1. `value`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `index_type` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.FloorDiv (TF::FloorDivOp) +Returns x // y element-wise. + +### Description: + +*NOTE*: `FloorDiv` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of number values +1. `y`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of number values + +## tf.Floor (TF::FloorOp) +Returns element-wise largest integer not greater than x. + +### Description: + + +### Operands: +1. `x`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point values + +## tf.FusedBatchNorm (TF::FusedBatchNormOp) +Batch normalization. + +### Description: + +Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +The size of 1D Tensors matches the dimension C of the 4D Tensors. + +### Operands: +1. `x`: tensor of 32-bit float values +1. `scale`: tensor of 32-bit float values +1. `offset`: tensor of 32-bit float values +1. `mean`: tensor of 32-bit float values +1. `variance`: tensor of 32-bit float values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `epsilon` | `FloatAttr` | 32-bit float attribute attribute | +| `data_format` | `StringAttr` | 'NHWC' or 'NCHW' convnet data format attribute | +| `is_training` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of 32-bit float values +1. `batch_mean`: tensor of 32-bit float values +1. `batch_variance`: tensor of 32-bit float values +1. 
`reserve_space_1`: tensor of 32-bit float values +1. `reserve_space_2`: tensor of 32-bit float values + +## tf.Gather (TF::GatherOp) +Gather slices from `params` according to `indices`. + +### Description: + +`indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +Produces an output tensor with shape `indices.shape + params.shape[1:]` where: + +```python + # Scalar indices + output[:, ..., :] = params[indices, :, ... :] + + # Vector indices + output[i, :, ..., :] = params[indices[i], :, ... :] + + # Higher rank indices + output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] +``` + +If `indices` is a permutation and `len(indices) == params.shape[0]` then +this operation will permute `params` accordingly. + +`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in +`indices` are always validated to be within range. If assigned to GPU, +out-of-bound indices result in safe but unspecified behavior, which may include +raising an error. + +
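+
+A concrete illustration of the indexing rule above (this example is added for
+clarity and is not part of the generated documentation):
+
+```python
+import tensorflow as tf
+
+params = tf.constant([[1, 2], [3, 4], [5, 6]])
+indices = tf.constant([2, 0])
+# output.shape == indices.shape + params.shape[1:] == [2, 2]
+tf.gather(params, indices)  # ==> [[5, 6], [1, 2]]
+```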
+ +
+ +### Operands: +1. `params`: tensor of tf.dtype values +1. `indices`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `validate_indices` | `BoolAttr` | bool attribute attribute | +| `Tindices` | `Attribute` | derived attribute attribute | +| `Tparams` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.GatherV2 (TF::GatherV2Op) + +Gather slices from `params` axis `axis` according to `indices`. + + +### Description: + +`indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +Produces an output tensor with shape `params.shape[:axis] + indices.shape + +params.shape[axis + 1:]` where: + +```python + # Scalar indices (output is rank(params) - 1). + output[a_0, ..., a_n, b_0, ..., b_n] = + params[a_0, ..., a_n, indices, b_0, ..., b_n] + + # Vector indices (output is rank(params)). + output[a_0, ..., a_n, i, b_0, ..., b_n] = + params[a_0, ..., a_n, indices[i], b_0, ..., b_n] + + # Higher rank indices (output is rank(params) + rank(indices) - 1). + output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] = + params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n] +``` + +
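+
+For instance (an illustrative example added for clarity, not part of the
+generated documentation), gathering along `axis = 1`:
+
+```python
+import tensorflow as tf
+
+params = tf.constant([[1, 2, 3],
+                      [4, 5, 6]])
+tf.gather(params, [2, 0], axis=1)  # ==> [[3, 1], [6, 4]]
+```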
+ +
+ +Note that on CPU, if an out of bound index is found, an error is returned. +On GPU, if an out of bound index is found, a 0 is stored in the +corresponding output value. + +See also `tf.batch_gather` and `tf.gather_nd`. + +### Operands: +1. `params`: tensor of tf.dtype values +1. `indices`: tensor of 32/64-bit integer values +1. `axis`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `batch_dims` | `IntegerAttr` | 64-bit integer attribute attribute | +| `Tindices` | `Attribute` | derived attribute attribute | +| `Tparams` | `Attribute` | derived attribute attribute | +| `Taxis` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.GreaterEqual (TF::GreaterEqualOp) +Returns the truth value of (x >= y) element-wise. + +### Description: + +*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of 8/16/32/64-bit integer or floating-point values +1. `y`: tensor of 8/16/32/64-bit integer or floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of 1-bit integer values + +## tf.Greater (TF::GreaterOp) +Returns the truth value of (x > y) element-wise. + +### Description: + +*NOTE*: `Greater` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of 8/16/32/64-bit integer or floating-point values +1. `y`: tensor of 8/16/32/64-bit integer or floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of 1-bit integer values + +## tf.IdentityN (TF::IdentityNOp) + +Returns a list of tensors with the same shapes and contents as the input + + +### Description: + +tensors. + +This op can be used to override the gradient for complicated functions. For +example, suppose y = f(x) and we wish to apply a custom function g for backprop +such that dx = g(dy). In Python, + +```python +with tf.get_default_graph().gradient_override_map( + {'IdentityN': 'OverrideGradientWithG'}): + y, _ = identity_n([f(x), x]) + +@tf.RegisterGradient('OverrideGradientWithG') +def ApplyG(op, dy, _): + return [None, g(dy)] # Do not backprop to f(x). +``` + +### Operands: +1. `input`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Identity (TF::IdentityOp) +Identity op + +### Description: + +Returns a tensor with the same shape and contents as input. + +### Operands: +1. `input`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Invert (TF::InvertOp) + +Invert (flip) each bit of supported types; for example, type `uint8` value 01010101 becomes 10101010. + + +### Description: + +Flip each bit of supported types. 
For example, type `int8` (decimal 2) binary 00000010 becomes (decimal -3) binary 11111101. +This operation is performed on each element of the tensor argument `x`. + +### Operands: +1. `x`: tensor of 8/16/32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of 8/16/32/64-bit integer values + +## tf.LeakyRelu (TF::LeakyReluOp) +Computes rectified linear: `max(features, features * alpha)`. + +### Description: + + +### Operands: +1. `features`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `alpha` | `FloatAttr` | 32-bit float attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `activations`: tensor of floating-point values + +## tf.LessEqual (TF::LessEqualOp) +Returns the truth value of (x <= y) element-wise. + +### Description: + +*NOTE*: `LessEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of 8/16/32/64-bit integer or floating-point values +1. `y`: tensor of 8/16/32/64-bit integer or floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of 1-bit integer values + +## tf.Less (TF::LessOp) +Returns the truth value of (x < y) element-wise. + +### Description: + +*NOTE*: `Less` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of 8/16/32/64-bit integer or floating-point values +1. `y`: tensor of 8/16/32/64-bit integer or floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of 1-bit integer values + +## tf.Log (TF::LogOp) +Computes natural logarithm of x element-wise. + +### Description: + +I.e., \\(y = \log_e x\\). + +### Operands: +1. `x`: tensor of floating-point or 64/128-bit complex type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point or 64/128-bit complex type values + +## tf.LogSoftmax (TF::LogSoftmaxOp) +Computes log softmax activations. + +### Description: + +For each batch `i` and class `j` we have + + logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i]))) + +### Operands: +1. `logits`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `logsoftmax`: tensor of floating-point values + +## tf.LogicalAnd (TF::LogicalAndOp) +Returns the truth value of x AND y element-wise. + +### Description: + +*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of 1-bit integer values +1. `y`: tensor of 1-bit integer values + +### Attributes: + +### Results: +1. 
`z`: tensor of 1-bit integer values + +## tf.LogicalNot (TF::LogicalNotOp) +Returns the truth value of NOT x element-wise. + +### Description: + + +### Operands: +1. `x`: tensor of 1-bit integer values + +### Attributes: + +### Results: +1. `y`: tensor of 1-bit integer values + +## tf.LogicalOr (TF::LogicalOrOp) +Returns the truth value of x OR y element-wise. + +### Description: + +*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of 1-bit integer values +1. `y`: tensor of 1-bit integer values + +### Attributes: + +### Results: +1. `z`: tensor of 1-bit integer values + +## tf.MatMul (TF::MatMulOp) + +Multiply the matrix "a" by the matrix "b". + + +### Description: + +The inputs must be two-dimensional matrices and the inner dimension of +"a" (after being transposed if transpose_a is true) must match the +outer dimension of "b" (after being transposed if transposed_b is +true). + +*Note*: The default kernel implementation for MatMul on GPUs uses +cublas. + +### Operands: +1. `a`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values +1. `b`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `transpose_a` | `BoolAttr` | bool attribute attribute | +| `transpose_b` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `product`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +## tf.Max (TF::MaxOp) + +Computes the maximum of elements across dimensions of a tensor. + + +### Description: + +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`. If `keep_dims` is true, the reduced dimensions are +retained with length 1. + +### Operands: +1. `input`: tensor of number values +1. `reduction_indices`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `keep_dims` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | +| `Tidx` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of number values + +## tf.MaxPool (TF::MaxPoolOp) +Performs max pooling on the input. + +### Description: + + +### Operands: +1. `input`: tensor of 8/16/32/64-bit integer or floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `ksize` | `ArrayAttr` | 64-bit integer array attribute with at least 4 elements attribute | +| `strides` | `ArrayAttr` | 64-bit integer array attribute with at least 4 elements attribute | +| `padding` | `StringAttr` | string attribute whose value is SAME, or VALID attribute | +| `data_format` | `StringAttr` | string attribute whose value is NHWC, or NCHW, or NCHW_VECT_C attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of 8/16/32/64-bit integer or floating-point values + +## tf.Maximum (TF::MaximumOp) +Returns the max of x and y (i.e. x > y ? 
x : y) element-wise. + +### Description: + +*NOTE*: `Maximum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of floating-point or 32/64-bit integer values +1. `y`: tensor of floating-point or 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of floating-point or 32/64-bit integer values + +## tf.Mean (TF::MeanOp) +Computes the mean of elements across dimensions of a tensor. + +### Description: + +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`. If `keep_dims` is true, the reduced dimensions are +retained with length 1. + +### Operands: +1. `input`: tensor of number values +1. `reduction_indices`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `keep_dims` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | +| `Tidx` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of number values + +## tf.Min (TF::MinOp) + +Computes the minimum of elements across dimensions of a tensor. + + +### Description: + +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`. If `keep_dims` is true, the reduced dimensions are +retained with length 1. + +### Operands: +1. `input`: tensor of number values +1. `reduction_indices`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `keep_dims` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | +| `Tidx` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of number values + +## tf.Minimum (TF::MinimumOp) +Returns the min of x and y (i.e. x < y ? x : y) element-wise. + +### Description: + +*NOTE*: `Minimum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of floating-point or 32/64-bit integer values +1. `y`: tensor of floating-point or 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of floating-point or 32/64-bit integer values + +## tf.MulNoNan (TF::MulNoNanOp) + +Returns x * y element-wise. Returns zero if y is zero, even if x is infinite or NaN. + + +### Description: + +*NOTE*: `MulNoNan` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values +1. `y`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. 
`z`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values + +## tf.Mul (TF::MulOp) +Returns x * y element-wise. + +### Description: + +*NOTE*: `Multiply` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of number values +1. `y`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of number values + +## tf.Neg (TF::NegOp) +Computes numerical negative value element-wise. + +### Description: + +I.e., \\(y = -x\\). + +### Operands: +1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +## tf.NoOp (TF::NoOp) +Does nothing. Only useful as a placeholder for control edges. + +### Description: + + +### Operands: + +### Attributes: + +### Results: + +## tf.NotEqual (TF::NotEqualOp) +Returns the truth value of (x != y) element-wise. + +### Description: + +*NOTE*: `NotEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values +1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of 1-bit integer values + +## tf.Pack (TF::PackOp) + +Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor. + + +### Description: + +Packs the `N` tensors in `values` into a tensor with rank one higher than each +tensor in `values`, by packing them along the `axis` dimension. +Given a list of tensors of shape `(A, B, C)`; + +if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`. +if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`. +Etc. + +For example: + +``` +# 'x' is [1, 4] +# 'y' is [2, 5] +# 'z' is [3, 6] +pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]] # Pack along first dim. +pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]] +``` + +This is the opposite of `unpack`. + +### Operands: +1. `values`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `N` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 1 attribute | +| `axis` | `IntegerAttr` | 64-bit integer attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Pad (TF::PadOp) +Pads a tensor with zeros. 
+ +### Description: + +This operation pads a `input` with zeros according to the `paddings` you +specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the +rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates +how many zeros to add before the contents of `input` in that dimension, and +`paddings[D, 1]` indicates how many zeros to add after the contents of `input` +in that dimension. + +The padded size of each dimension D of the output is: + +`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` + +For example: + +``` +# 't' is [[1, 1], [2, 2]] +# 'paddings' is [[1, 1], [2, 2]] +# rank of 't' is 2 +pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] + [0, 0, 1, 1, 0, 0] + [0, 0, 2, 2, 0, 0] + [0, 0, 0, 0, 0, 0]] +``` + +### Operands: +1. `input`: tensor of tf.dtype values +1. `paddings`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tpaddings` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.PadV2 (TF::PadV2Op) +Pads a tensor. + +### Description: + +This operation pads `input` according to the `paddings` and `constant_values` +you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is +the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates +how many padding values to add before the contents of `input` in that dimension, +and `paddings[D, 1]` indicates how many padding values to add after the contents +of `input` in that dimension. `constant_values` is a scalar tensor of the same +type as `input` that indicates the value to use for padding `input`. + +The padded size of each dimension D of the output is: + +`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` + +For example: + +``` +# 't' is [[1, 1], [2, 2]] +# 'paddings' is [[1, 1], [2, 2]] +# 'constant_values' is 0 +# rank of 't' is 2 +pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] + [0, 0, 1, 1, 0, 0] + [0, 0, 2, 2, 0, 0] + [0, 0, 0, 0, 0, 0]] +``` + +### Operands: +1. `input`: tensor of tf.dtype values +1. `paddings`: tensor of 32/64-bit integer values +1. `constant_values`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tpaddings` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Placeholder.input (TF::PlaceholderInputOp) +PlaceholderInput op + +### Description: + +Inserts a placeholder for a tensor that will be always fed. + +### Operands: +1. `arg`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `min` | `FloatAttr` | 32-bit float attribute attribute | +| `max` | `FloatAttr` | 32-bit float attribute attribute | +| `type` | `TypeAttr` | integer type attribute | +| `dtype` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Placeholder (TF::PlaceholderOp) +Placeholder op + +### Description: + +Inserts a placeholder for a tensor that will be always fed. + +### Operands: + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `dtype` | `Attribute` | derived attribute attribute | + +### Results: +1. 
`output`: tensor of tf.dtype values + +## tf.QuantizeAndDequantize (TF::QuantizeAndDequantizeOp) +Use QuantizeAndDequantizeV2 instead. + +### Description: + + +### Operands: +1. `input`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `signed_input` | `BoolAttr` | bool attribute attribute | +| `num_bits` | `IntegerAttr` | 64-bit integer attribute attribute | +| `range_given` | `BoolAttr` | bool attribute attribute | +| `input_min` | `FloatAttr` | 32-bit float attribute attribute | +| `input_max` | `FloatAttr` | 32-bit float attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of floating-point values + +## tf.QuantizeAndDequantizeV2 (TF::QuantizeAndDequantizeV2Op) +Quantizes then dequantizes a tensor. + +### Description: + +This op simulates the precision loss from the quantized forward pass by: + +1. Quantizing the tensor to fixed point numbers, which should match the target + quantization method when it is used in inference. +2. Dequantizing it back to floating point numbers for the following ops, most + likely matmul. + +There are different ways to quantize. This version uses only scaling, so 0.0 +maps to 0. + +From the specified 'num_bits' in the quantized output type, it determines +minimum and maximum representable quantized values. + +e.g. + +* [-128, 127] for signed, num_bits = 8, or +* [0, 255] for unsigned, num_bits = 8. + +If range_given == False, the initial input_min, input_max will be determined +automatically as the minimum and maximum values in the input tensor, otherwise +the specified values of input_min, input_max are used. + +Note: If the input_min, input_max are specified, they do not need to equal the +actual minimum and maximum values in the tensor. e.g. in some cases it may be +beneficial to specify these values such that the low probability extremes of the +input distribution are clipped. + +This op determines the maximum scale_factor that would map the initial +[input_min, input_max] range to a range that lies within the representable +quantized range. + +It determines the scale from one of input_min and input_max, then updates the +other one to maximize the representable range. + +e.g. + +* if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0, + 5.0]: it would use a scale_factor of -128 / -10.0 = 12.8. In this case, it + would update input_max to be 127 / 12.8 = 9.921875 +* if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0, + 10.0]: it would use a scale_factor of 127 / 10.0 = 12.7. In this case, it + would update input_min to be -128.0 / 12.7 = -10.07874 +* if the output is unsigned, input_min is forced to be 0, and only the + specified input_max is used. + +After determining the scale_factor and updating the input range, it applies the +following to each value in the 'input' tensor. + +output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor. + +The above round function rounds the value based on the given round_mode. + +### Operands: +1. `input`: tensor of floating-point values +1. `input_min`: tensor of floating-point values +1.
`input_max`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `signed_input` | `BoolAttr` | bool attribute attribute | +| `num_bits` | `IntegerAttr` | 64-bit integer attribute attribute | +| `range_given` | `BoolAttr` | bool attribute attribute | +| `round_mode` | `StringAttr` | string attribute whose value is HALF_TO_EVEN, or HALF_UP attribute | +| `narrow_range` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of floating-point values + +## tf.QuantizeAndDequantizeV3 (TF::QuantizeAndDequantizeV3Op) +Quantizes then dequantizes a tensor. + +### Description: + +This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a +tensor, so its value can change during training. + +### Operands: +1. `input`: tensor of floating-point values +1. `input_min`: tensor of floating-point values +1. `input_max`: tensor of floating-point values +1. `num_bits`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `signed_input` | `BoolAttr` | bool attribute attribute | +| `range_given` | `BoolAttr` | bool attribute attribute | +| `narrow_range` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of floating-point values + +## tf.RandomUniform (TF::RandomUniformOp) +Outputs random values from a uniform distribution. + +### Description: + +The generated values follow a uniform distribution in the range `[0, 1)`. The +lower bound 0 is included in the range, while the upper bound 1 is excluded. + +### Operands: +1. `shape`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `seed` | `IntegerAttr` | 64-bit integer attribute attribute | +| `seed2` | `IntegerAttr` | 64-bit integer attribute attribute | +| `T` | `Attribute` | derived attribute attribute | +| `dtype` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of floating-point values + +## tf.Range (TF::RangeOp) +Creates a sequence of numbers. + +### Description: + +This operation creates a sequence of numbers that begins at `start` and +extends by increments of `delta` up to but not including `limit`. + +For example: + +``` +# 'start' is 3 +# 'limit' is 18 +# 'delta' is 3 +tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15] +``` + +### Operands: +1. `start`: tensor of bfloat16 type or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer values +1. `limit`: tensor of bfloat16 type or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer values +1. `delta`: tensor of bfloat16 type or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `Tidx` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of bfloat16 type or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer values + +## tf.Rank (TF::RankOp) +Returns the rank of a tensor. + +### Description: + +This operation returns an integer representing the rank of `input`. 
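+
+As a quick, hypothetical Python-level illustration (using the `tf.rank` API rather than the raw op), the dynamic rank returned here is the same value the static shape exposes when it is fully known:
+
+```python
+import tensorflow as tf
+
+t = tf.zeros([2, 2, 3])
+print(tf.rank(t))    # 0-D int32 tensor holding 3, computed at runtime
+print(t.shape.rank)  # 3, known statically from the tensor's shape
+```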
+ +For example: + +``` +# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] +# shape of tensor 't' is [2, 2, 3] +rank(t) ==> 3 +``` + +**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank +of a tensor is the number of indices required to uniquely select each element +of the tensor. Rank is also known as "order", "degree", or "ndims." + +### Operands: +1. `input`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of 32-bit integer values + +## tf.RealDiv (TF::RealDivOp) +Returns x / y element-wise for real types. + +### Description: + +If `x` and `y` are reals, this will return the floating-point division. + +*NOTE*: `Div` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of number values +1. `y`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of number values + +## tf.Reciprocal (TF::ReciprocalOp) +Computes the reciprocal of x element-wise. + +### Description: + +I.e., \\(y = 1 / x\\). + +### Operands: +1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +## tf.Relu6 (TF::Relu6Op) +Computes rectified linear 6: `min(max(features, 0), 6)`. + +### Description: + + +### Operands: +1. `features`: tensor of 8/16/32/64-bit integer or floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `activations`: tensor of 8/16/32/64-bit integer or floating-point values + +## tf.Relu (TF::ReluOp) +Computes rectified linear: `max(features, 0)`. + +### Description: + + +### Operands: +1. `features`: tensor of 8/16/32/64-bit integer or floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `activations`: tensor of 8/16/32/64-bit integer or floating-point values + +## tf.Reshape (TF::ReshapeOp) +Reshapes a tensor. + +### Description: + +Given `tensor`, this operation returns a tensor that has the same values +as `tensor` with shape `shape`. + +If one component of `shape` is the special value -1, the size of that dimension +is computed so that the total size remains constant. In particular, a `shape` +of `[-1]` flattens into 1-D. At most one component of `shape` can be -1. + +If `shape` is 1-D or higher, then the operation returns a tensor with shape +`shape` filled with the values of `tensor`. In this case, the number of elements +implied by `shape` must be the same as the number of elements in `tensor`. 
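+
+A minimal Python sketch of the `-1` inference rule described above (hypothetical usage via `tf.reshape`; the canonical examples follow below):
+
+```python
+import tensorflow as tf
+
+t = tf.range(12)                     # 12 elements
+print(tf.reshape(t, [3, -1]).shape)  # -1 is inferred as 12 / 3 = 4, giving (3, 4)
+print(tf.reshape(t, [-1]).shape)     # [-1] flattens to 1-D, giving (12,)
+```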
+ +For example: + +``` +# tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9] +# tensor 't' has shape [9] +reshape(t, [3, 3]) ==> [[1, 2, 3], + [4, 5, 6], + [7, 8, 9]] + +# tensor 't' is [[[1, 1], [2, 2]], +# [[3, 3], [4, 4]]] +# tensor 't' has shape [2, 2, 2] +reshape(t, [2, 4]) ==> [[1, 1, 2, 2], + [3, 3, 4, 4]] + +# tensor 't' is [[[1, 1, 1], +# [2, 2, 2]], +# [[3, 3, 3], +# [4, 4, 4]], +# [[5, 5, 5], +# [6, 6, 6]]] +# tensor 't' has shape [3, 2, 3] +# pass '[-1]' to flatten 't' +reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6] + +# -1 can also be used to infer the shape + +# -1 is inferred to be 9: +reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3], + [4, 4, 4, 5, 5, 5, 6, 6, 6]] +# -1 is inferred to be 2: +reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3], + [4, 4, 4, 5, 5, 5, 6, 6, 6]] +# -1 is inferred to be 3: +reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1], + [2, 2, 2], + [3, 3, 3]], + [[4, 4, 4], + [5, 5, 5], + [6, 6, 6]]] + +# tensor 't' is [7] +# shape `[]` reshapes to a scalar +reshape(t, []) ==> 7 +``` + +### Operands: +1. `tensor`: tensor of tf.dtype values +1. `shape`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tshape` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.ResizeBilinear (TF::ResizeBilinearOp) +Resize `images` to `size` using bilinear interpolation. + +### Description: + +Input images can be of different types but output images are always float. + +### Operands: +1. `images`: tensor of 8/16/32/64-bit integer or floating-point values +1. `size`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `align_corners` | `BoolAttr` | bool attribute attribute | +| `half_pixel_centers` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `resized_images`: tensor of 32-bit float values + +## tf.ReverseV2 (TF::ReverseV2Op) +Reverses specific dimensions of a tensor. + +### Description: + +NOTE `tf.reverse` has now changed behavior in preparation for 1.0. +`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0. + +Given a `tensor`, and a `int32` tensor `axis` representing the set of +dimensions of `tensor` to reverse. This operation reverses each dimension +`i` for which there exists `j` s.t. `axis[j] == i`. + +`tensor` can have up to 8 dimensions. The number of dimensions specified +in `axis` may be 0 or more entries. If an index is specified more than +once, a InvalidArgument error is raised. + +For example: + +``` +# tensor 't' is [[[[ 0, 1, 2, 3], +# [ 4, 5, 6, 7], +# [ 8, 9, 10, 11]], +# [[12, 13, 14, 15], +# [16, 17, 18, 19], +# [20, 21, 22, 23]]]] +# tensor 't' shape is [1, 2, 3, 4] + +# 'dims' is [3] or 'dims' is [-1] +reverse(t, dims) ==> [[[[ 3, 2, 1, 0], + [ 7, 6, 5, 4], + [ 11, 10, 9, 8]], + [[15, 14, 13, 12], + [19, 18, 17, 16], + [23, 22, 21, 20]]]] + +# 'dims' is '[1]' (or 'dims' is '[-3]') +reverse(t, dims) ==> [[[[12, 13, 14, 15], + [16, 17, 18, 19], + [20, 21, 22, 23] + [[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11]]]] + +# 'dims' is '[2]' (or 'dims' is '[-2]') +reverse(t, dims) ==> [[[[8, 9, 10, 11], + [4, 5, 6, 7], + [0, 1, 2, 3]] + [[20, 21, 22, 23], + [16, 17, 18, 19], + [12, 13, 14, 15]]]] +``` + +### Operands: +1. 
`tensor`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values +1. `axis`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tidx` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 1-bit integer or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer or complex128 type or complex64 type or TensorFlow string type values + +## tf.Rsqrt (TF::RsqrtOp) +Computes reciprocal of square root of x element-wise. + +### Description: + +I.e., \\(y = 1 / \sqrt{x}\\). + +### Operands: +1. `x`: tensor of floating-point or 64/128-bit complex type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point or 64/128-bit complex type values + +## tf.Select (TF::SelectOp) +Selects elements from `x` or `y`, depending on `condition`. + +### Description: + +The `x`, and `y` tensors must all have the same shape, and the +output will also have that shape. + +The `condition` tensor must be a scalar if `x` and `y` are scalars. +If `x` and `y` are vectors or higher rank, then `condition` must be either a +scalar, a vector with size matching the first dimension of `x`, or must have +the same shape as `x`. + +The `condition` tensor acts as a mask that chooses, based on the value at each +element, whether the corresponding element / row in the output should be +taken from `x` (if true) or `y` (if false). + +If `condition` is a vector and `x` and `y` are higher rank matrices, then +it chooses which row (outer dimension) to copy from `x` and `y`. +If `condition` has the same shape as `x` and `y`, then it chooses which +element to copy from `x` and `y`. + +For example: + +```python +# 'condition' tensor is [[True, False] +# [False, True]] +# 't' is [[1, 2], +# [3, 4]] +# 'e' is [[5, 6], +# [7, 8]] +select(condition, t, e) # => [[1, 6], [7, 4]] + + +# 'condition' tensor is [True, False] +# 't' is [[1, 2], +# [3, 4]] +# 'e' is [[5, 6], +# [7, 8]] +select(condition, t, e) ==> [[1, 2], + [7, 8]] + +``` + +### Operands: +1. `condition`: tensor of 1-bit integer values +1. `t`: tensor of tf.dtype values +1. `e`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Shape (TF::ShapeOp) +Returns the shape of a tensor. + +### Description: + +This operation returns a 1-D integer tensor representing the shape of `input`. + +For example: + +``` +# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] +shape(t) ==> [2, 2, 3] +``` + +### Operands: +1. `input`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `out_type` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of 32/64-bit integer values + +## tf.Sigmoid (TF::SigmoidOp) +Computes sigmoid of `x` element-wise. 
+ +### Description: + +Specifically, `y = 1 / (1 + exp(-x))`. + +### Operands: +1. `x`: tensor of floating-point or 64/128-bit complex type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point or 64/128-bit complex type values + +## tf.Sin (TF::SinOp) +Computes sin of x element-wise. + +### Description: + + +### Operands: +1. `x`: tensor of floating-point or 64/128-bit complex type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point or 64/128-bit complex type values + +## tf.Slice (TF::SliceOp) +Return a slice from 'input'. + +### Description: + +The output tensor is a tensor with dimensions described by 'size' +whose values are extracted from 'input' starting at the offsets in +'begin'. + +*Requirements*: + 0 <= begin[i] <= begin[i] + size[i] <= Di for i in [0, n) + +### Operands: +1. `input`: tensor of tf.dtype values +1. `begin`: tensor of 32/64-bit integer values +1. `size`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Index` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Softmax (TF::SoftmaxOp) +Computes softmax activations. + +### Description: + +For each batch `i` and class `j` we have + + $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$ + +### Operands: +1. `logits`: tensor of floating-point values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `softmax`: tensor of floating-point values + +## tf.SpaceToBatchND (TF::SpaceToBatchNDOp) +SpaceToBatch for N-D tensors of type T. + +### Description: + +This operation divides "spatial" dimensions `[1, ..., M]` of the input into a +grid of blocks of shape `block_shape`, and interleaves these blocks with the +"batch" dimension (0) such that in the output, the spatial dimensions +`[1, ..., M]` correspond to the position within the grid, and the batch +dimension combines both the position within a spatial block and the original +batch position. Prior to division into blocks, the spatial dimensions of the +input are optionally zero padded according to `paddings`. See below for a +precise description. + +### Operands: +1. `input`: tensor of tf.dtype values +1. `block_shape`: tensor of 32/64-bit integer values +1. `paddings`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tpaddings` | `Attribute` | derived attribute attribute | +| `Tblock_shape` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Split (TF::SplitOp) +Splits a tensor into `num_split` tensors along one dimension. + +### Description: + + +### Operands: +1. `split_dim`: tensor of 32-bit integer values +1. 
`value`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `num_split` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 1 attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.SplitV (TF::SplitVOp) +Splits a tensor into `num_split` tensors along one dimension. + +### Description: + + +### Operands: +1. `value`: tensor of tf.dtype values +1. `size_splits`: tensor of 32/64-bit integer values +1. `split_dim`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `num_split` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 1 attribute | +| `Tlen` | `Attribute` | derived attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Sqrt (TF::SqrtOp) +Computes square root of x element-wise. + +### Description: + +I.e., \\(y = \sqrt{x} = x^{1/2}\\). + +### Operands: +1. `x`: tensor of floating-point or 64/128-bit complex type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of floating-point or 64/128-bit complex type values + +## tf.Square (TF::SquareOp) +Computes square of x element-wise. + +### Description: + +I.e., \\(y = x * x = x^2\\). + +### Operands: +1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +## tf.SquaredDifference (TF::SquaredDifferenceOp) +Returns (x - y)(x - y) element-wise. + +### Description: + +*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values +1. `y`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of bfloat16 type or 16-bit float or 32-bit float or 64-bit float or 32-bit integer or 64-bit integer or complex128 type or complex64 type values + +## tf.Squeeze (TF::SqueezeOp) +Removes dimensions of size 1 from the shape of a tensor. + +### Description: + +Given a tensor `input`, this operation returns a tensor of the same type with +all dimensions of size 1 removed. If you don't want to remove all size 1 +dimensions, you can remove specific size 1 dimensions by specifying +`axis`. 
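+
+A small Python sketch of both forms (hypothetical usage via `tf.squeeze`, whose `axis` argument corresponds to the `squeeze_dims` attribute listed below; the canonical examples follow):
+
+```python
+import tensorflow as tf
+
+t = tf.zeros([1, 2, 1, 3, 1, 1])
+print(tf.squeeze(t).shape)               # every size-1 dimension removed: (2, 3)
+print(tf.squeeze(t, axis=[2, 4]).shape)  # only dimensions 2 and 4 removed: (1, 2, 3, 1)
+```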
+ +For example: + +``` +# 't' is a tensor of shape [1, 2, 1, 3, 1, 1] +shape(squeeze(t)) ==> [2, 3] +``` + +Or, to remove specific size 1 dimensions: + +``` +# 't' is a tensor of shape [1, 2, 1, 3, 1, 1] +shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] +``` + +### Operands: +1. `input`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `squeeze_dims` | `ArrayAttr` | 64-bit integer array attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.StridedSlice (TF::StridedSliceOp) +Return a strided slice from `input`. + +### Description: + +Note, most python users will want to use the Python `Tensor.__getitem__` +or `Variable.__getitem__` rather than this op directly. + +The goal of this op is to produce a new tensor with a subset of +the elements from the `n` dimensional `input` tensor. The subset is chosen using +a sequence of `m` sparse range specifications encoded into the arguments +of this function. Note, in some cases +`m` could be equal to `n`, but this need not be the case. Each +range specification entry can be one of the following: + +- An ellipsis (...). Ellipses are used to imply zero or more + dimensions of full-dimension selection and are produced using + `ellipsis_mask`. For example, `foo[...]` is the identity slice. + +- A new axis. This is used to insert a new shape=1 dimension and is + produced using `new_axis_mask`. For example, `foo[:, ...]` where + `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor. + + +- A range `begin:end:stride`. This is used to specify how much to choose from + a given dimension. `stride` can be any integer but 0. `begin` is an integer + which represents the index of the first value to select while `end` represents + the index of the last value to select. The number of values selected in each + dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`. + `begin` and `end` can be negative where `-1` is the last element, `-2` is + the second to last. `begin_mask` controls whether to replace the explicitly + given `begin` with an implicit effective value of `0` if `stride > 0` and + `-1` if `stride < 0`. `end_mask` is analogous but produces the number + required to create the largest open interval. For example, given a shape + `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do + not assume this is equivalent to `foo[0:-1]` which has an effective `begin` + and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the + first dimension of a tensor while dropping the last two (in the original + order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`. + +- A single index. This is used to keep only elements that have a given + index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a + shape `(6,)` tensor. This is encoded in `begin` and `end` and + `shrink_axis_mask`. + +Each conceptual range specification is encoded in the op's argument. This +encoding is best understand by considering a non-trivial example. 
In +particular, +`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as + +``` +begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0) +end = [2, 4, x, x, -3, x] +strides = [1, 1, x, x, -1, 1] +begin_mask = 1<<4 | 1<<5 = 48 +end_mask = 1<<5 = 32 +ellipsis_mask = 1<<3 = 8 +new_axis_mask = 1<<2 = 4 +shrink_axis_mask = 1<<0 = 1 +``` + +In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of +the slice becomes (2, 1, 5, 5, 2, 5). +Let us walk step by step through each argument specification. + +1. The first argument in the example slice is turned into `begin = 1` and +`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we +also set the appropriate bit in `shrink_axis_mask`. + +2. `2:4` contributes 2, 4, 1 to begin, end, and stride. All masks have +zero bits contributed. + +3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1 +in the final shape. Dummy values are contributed to begin, +end and stride, while the new_axis_mask bit is set. + +4. `...` grabs the full ranges from as many dimensions as needed to +fully specify a slice for every dimension of the input shape. + +5. `:-3:-1` shows the use of negative indices. A negative index `i` associated +with a dimension that has shape `s` is converted to a positive index +`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion +is done internally so begin, end and strides receive x, -3, and -1. +The appropriate begin_mask bit is set to indicate the start range is the +full range (ignoring the x). + +6. `:` indicates that the entire contents of the corresponding dimension +is selected. This is equivalent to `::` or `0::1`. begin, end, and strides +receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and +`end_mask` are also set. + +*Requirements*: + `0 != strides[i] for i in [0, m)` + `ellipsis_mask must be a power of two (only one ellipsis)` + +### Operands: +1. `input`: tensor of tf.dtype values +1. `begin`: tensor of 32/64-bit integer values +1. `end`: tensor of 32/64-bit integer values +1. `strides`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `begin_mask` | `IntegerAttr` | 64-bit integer attribute attribute | +| `end_mask` | `IntegerAttr` | 64-bit integer attribute attribute | +| `ellipsis_mask` | `IntegerAttr` | 64-bit integer attribute attribute | +| `new_axis_mask` | `IntegerAttr` | 64-bit integer attribute attribute | +| `shrink_axis_mask` | `IntegerAttr` | 64-bit integer attribute attribute | +| `T` | `Attribute` | derived attribute attribute | +| `Index` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Sub (TF::SubOp) +Returns x - y element-wise. + +### Description: + +*NOTE*: `Subtract` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of number values +1. `y`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of number values + +## tf.Sum (TF::SumOp) +Computes the sum of elements across dimensions of a tensor. + +### Description: + +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`.
If `keep_dims` is true, the reduced dimensions are +retained with length 1. + +### Operands: +1. `input`: tensor of number values +1. `reduction_indices`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `keep_dims` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | +| `Tidx` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of number values + +## tf.TensorListFromTensor (TF::TensorListFromTensorOp) + +Creates a TensorList which, when stacked, has the value of `tensor`. + + +### Description: + +Each tensor in the result list corresponds to one row of the input tensor. + +tensor: The input tensor. +output_handle: The list. + +### Operands: +1. `tensor`: tensor of tf.dtype values +1. `element_shape`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `shape_type` | `Attribute` | derived attribute attribute | +| `element_dtype` | `Attribute` | derived attribute attribute | + +### Results: +1. `output_handle`: tensor of TensorFlow variant type values + +## tf.TensorListGetItem (TF::TensorListGetItemOp) + + +### Description: + + +### Operands: +1. `input_handle`: tensor of TensorFlow variant type values +1. `index`: tensor of 32-bit integer values +1. `element_shape`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `element_dtype` | `Attribute` | derived attribute attribute | + +### Results: +1. `item`: tensor of tf.dtype values + +## tf.TensorListReserve (TF::TensorListReserveOp) +List of the given size with empty elements. + +### Description: + +element_shape: the shape of the future elements of the list +num_elements: the number of elements to reserve +handle: the output list +element_dtype: the desired type of elements in the list. + +### Operands: +1. `element_shape`: tensor of 32/64-bit integer values +1. `num_elements`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `element_dtype` | `TypeAttr` | any type attribute attribute | +| `shape_type` | `Attribute` | derived attribute attribute | + +### Results: +1. `handle`: tensor of TensorFlow variant type values + +## tf.TensorListSetItem (TF::TensorListSetItemOp) + + +### Description: + + +### Operands: +1. `input_handle`: tensor of TensorFlow variant type values +1. `index`: tensor of 32-bit integer values +1. `item`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `element_dtype` | `Attribute` | derived attribute attribute | + +### Results: +1. `output_handle`: tensor of TensorFlow variant type values + +## tf.TensorListStack (TF::TensorListStackOp) +Stacks all tensors in the list. + +### Description: + +Requires that all tensors have the same shape. + +input_handle: the input list +tensor: the gathered result +num_elements: optional. If not -1, the number of elements in the list. + +### Operands: +1. `input_handle`: tensor of TensorFlow variant type values +1. 
`element_shape`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `num_elements` | `IntegerAttr` | 64-bit integer attribute attribute | +| `element_dtype` | `Attribute` | derived attribute attribute | + +### Results: +1. `tensor`: tensor of tf.dtype values + +## tf.TopKV2 (TF::TopKV2Op) + +Finds values and indices of the `k` largest elements for the last dimension. + + +### Description: + +If the input is a vector (rank-1), finds the `k` largest entries in the vector +and outputs their values and indices as vectors. Thus `values[j]` is the +`j`-th largest entry in `input`, and its index is `indices[j]`. + +For matrices (resp. higher rank input), computes the top `k` entries in each +row (resp. vector along the last dimension). Thus, + + values.shape = indices.shape = input.shape[:-1] + [k] + +If two elements are equal, the lower-index element appears first. + +### Operands: +1. `input`: tensor of 8/16/32/64-bit integer or floating-point values +1. `k`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `sorted` | `BoolAttr` | bool attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `values`: tensor of 8/16/32/64-bit integer or floating-point values +1. `indices`: tensor of 32-bit integer values + +## tf.Transpose (TF::TransposeOp) +Shuffle dimensions of x according to a permutation. + +### Description: + +The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy: + `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]` + +### Operands: +1. `x`: tensor of tf.dtype values +1. `perm`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | +| `Tperm` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of tf.dtype values + +## tf.TruncateDiv (TF::TruncateDivOp) +Returns x / y element-wise for integer types. + +### Description: + +Truncation designates that negative numbers will round fractional quantities +toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different +than Python semantics. See `FloorDiv` for a division function that matches +Python Semantics. + +*NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +### Operands: +1. `x`: tensor of number values +1. `y`: tensor of number values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of number values + +## tf.Unpack (TF::UnpackOp) + +Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors. + + +### Description: + +Unpacks `num` tensors from `value` by chipping it along the `axis` dimension. +For example, given a tensor of shape `(A, B, C, D)`; + +If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]` + and each tensor in `output` will have shape `(B, C, D)`. (Note that the + dimension unpacked along is gone, unlike `split`). + +If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]` + and each tensor in `output` will have shape `(A, C, D)`. +Etc. + +This is the opposite of `pack`. + +### Operands: +1. 
`value`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `num` | `IntegerAttr` | 64-bit integer attribute whose minimal value is 0 attribute | +| `axis` | `IntegerAttr` | 64-bit integer attribute attribute | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of tf.dtype values + +## tf.Xdivy (TF::XdivyOp) +Returns 0 if x == 0, and x / y otherwise, elementwise. + +### Description: + + +### Operands: +1. `x`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values +1. `y`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `z`: tensor of 16-bit float or 32-bit float or 64-bit float or complex128 type or complex64 type values + +## tf.ZerosLike (TF::ZerosLikeOp) +Returns a tensor of zeros with the same shape and type as x. + +### Description: + + +### Operands: +1. `x`: tensor of tf.dtype values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `T` | `Attribute` | derived attribute attribute | + +### Results: +1. `y`: tensor of tf.dtype values + diff --git a/tensorflow/compiler/mlir/g3doc/tfl_ops.md b/tensorflow/compiler/mlir/g3doc/tfl_ops.md new file mode 100644 index 00000000000..74e4fc47868 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/tfl_ops.md @@ -0,0 +1,1606 @@ + +# Operation definition +## tfl.abs (TFL::AbsOp) +Absolute value operator + +### Description: + +Given a tensor `x`, this operation returns a tensor containing the absolute +value of each element in `x`. For example, if x is an input element and y is +an output element, this operation computes \\(y = |x|\\). + +### Operands: +1. `x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.add_n (TFL::AddNOp) +add_n operator + +### Description: + +Adds all input tensors element-wise. + +### Operands: +1. `inputs`: tensor of 32-bit float or 32-bit integer values + +### Attributes: + +### Results: +1. `sum`: tensor of 32-bit float or 32-bit integer values + +## tfl.add (TFL::AddOp) +Addition operator + +### Description: + +Element-wise addition operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.average_pool_2d (TFL::AveragePool2DOp) +Average_pool_2d operator + +### Description: + +Performs average-pooling operation on input. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `filter_height` | `IntegerAttr` | 32-bit integer attribute attribute | +| `filter_width` | `IntegerAttr` | 32-bit integer attribute attribute | +| `padding` | `StringAttr` | padding enum attribute | +| `stride_h` | `IntegerAttr` | 32-bit integer attribute attribute | +| `stride_w` | `IntegerAttr` | 32-bit integer attribute attribute | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | + +### Results: +1. 
`output`: tensor of any type values + +## tfl.batch_to_space_nd (TFL::BatchToSpaceNdOp) +BatchToSpaceNd operator + +### Description: + +This operation reshapes the "batch" dimension 0 into space dimensions. + +### Operands: +1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values +1. `block_shape`: tensor of 32-bit integer values +1. `indices`: tensor of 32-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values + +## tfl.ceil (TFL::CeilOp) +Ceil operator + +### Description: + +Returns element-wise ceil value of the input. + +### Operands: +1. `x`: tensor of floating-point values + +### Attributes: + +### Results: +1. `y`: tensor of floating-point values + +## tfl.concatenation (TFL::ConcatenationOp) +Concatenation operator + +### Description: + +Concatenates tensors along one dimension + +### Operands: +1. `values`: tensor of 32-bit float or 64-bit integer or 32-bit integer or 16-bit integer or 8-bit integer or quantized type with 8 bits storage type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `axis` | `IntegerAttr` | 32-bit integer attribute attribute | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | + +### Results: +1. `output`: tensor of 32-bit float or 64-bit integer or 32-bit integer or 16-bit integer or 8-bit integer or quantized type with 8 bits storage type values + +## tfl.pseudo_const (TFL::ConstOp) +Constant pseudo op. + +### Description: + +Represents a constant value in TensorFlow Lite dialect. This is not an +actual operation and it will be lowered to buffer instead. + +The op is allowed to have all the same type of attributes as tf.Const does +(e.g., opaque TF attributes are allowed). + +### Operands: + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `value` | `ElementsAttr` | constant vector/tensor attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.conv_2d (TFL::Conv2DOp) +Convolution operator + +### Description: + +Performs convolution operation on inputs. + +Inputs: + `inputs[0]`: required: the input activation tensor + `inputs[1]`: required: the filter weight tensor + `inputs[2]`: optional: the bias tensor + +### Operands: +1. `input`: tensor of any type values +1. `filter`: tensor of any type values +1. `bias`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `dilation_h_factor` | `IntegerAttr` | 32-bit integer attribute attribute | +| `dilation_w_factor` | `IntegerAttr` | 32-bit integer attribute attribute | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | +| `padding` | `StringAttr` | padding enum attribute | +| `stride_h` | `IntegerAttr` | 32-bit integer attribute attribute | +| `stride_w` | `IntegerAttr` | 32-bit integer attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.cos (TFL::CosOp) +Cosine operator + +### Description: + +Computes element-wise Cosine of input + +### Operands: +1. `x`: tensor of floating-point values + +### Attributes: + +### Results: +1. `y`: tensor of floating-point values + +## tfl.depthwise_conv_2d (TFL::DepthwiseConv2DOp) +Depthwise-separable convolution operator + +### Description: + +Performs convolution operation on inputs. 
+ +Inputs: + `inputs[0]`: required: the input activation tensor + `inputs[1]`: required: the filter weight tensor + `inputs[2]`: optional: the bias tensor + +### Operands: +1. `input`: tensor of any type values +1. `filter`: tensor of any type values +1. `bias`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `dilation_h_factor` | `IntegerAttr` | 32-bit integer attribute attribute | +| `dilation_w_factor` | `IntegerAttr` | 32-bit integer attribute attribute | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | +| `padding` | `StringAttr` | padding enum attribute | +| `stride_h` | `IntegerAttr` | 32-bit integer attribute attribute | +| `stride_w` | `IntegerAttr` | 32-bit integer attribute attribute | +| `depth_multiplier` | `IntegerAttr` | 32-bit integer attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.dequantize (TFL::DequantizeOp) +Dequantize operator + +### Description: + +Converts quantized array of integers to floating-points according to the +quantization parameters. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.div (TFL::DivOp) +Division operator + +### Description: + +Element-wise division operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.elu (TFL::EluOp) +Exponential Linear Unit operator + +### Description: + +Computes the exponential linear + f(x) -> exp(x) - 1 for x < 0, x for x >= 0. +element-wise. + +### Operands: +1. `x`: tensor of floating-point values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.equal (TFL::EqualOp) +Equal operator + +### Description: + +Returns the truth element of x == y element-wise + +### Operands: +1. `x`: tensor of 1-bit integer or 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values +1. `y`: tensor of 1-bit integer or 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 1-bit integer values + +## tfl.exp (TFL::ExpOp) +Natural exponentiation operator + +### Description: + +Performs element-wise natural exponentiation operation on input. + +### Operands: +1. `x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.expand_dims (TFL::ExpandDimsOp) +Inserts a dimension of 1 into a tensor's shape. + +### Description: + +Given a tensor `input`, this operation inserts a dimension of 1 at the +dimension index `axis` of `input`'s shape. The dimension index `axis` starts at +zero; if you specify a negative number for `axis` it is counted backward from +the end. + +This operation is useful if you want to add a batch dimension to a single +element. For example, if you have a single image of shape `[height, width, +channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`, +which will make the shape `[1, height, width, channels]`. 
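+
+The batch-dimension case above can be sketched in Python as follows (a hypothetical illustration using the corresponding `tf.expand_dims` API rather than the TFLite kernel itself):
+
+```python
+import tensorflow as tf
+
+image = tf.zeros([224, 224, 3])   # a single [height, width, channels] image
+batch = tf.expand_dims(image, 0)  # insert a new dimension at index 0
+print(batch.shape)                # (1, 224, 224, 3)
+```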
+ +Other examples: + +``` +# 't' is a tensor of shape [2] +shape(expand_dims(t, 0)) ==> [1, 2] +shape(expand_dims(t, 1)) ==> [2, 1] +shape(expand_dims(t, -1)) ==> [2, 1] + +# 't2' is a tensor of shape [2, 3, 5] +shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5] +shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5] +shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1] +``` + +This operation requires that: + +`-1-input.dims() <= dim <= input.dims()` + +This operation is related to `squeeze()`, which removes dimensions of +size 1. + +### Operands: +1. `input`: tensor of any type values +1. `dim`: tensor of any integer type + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.fake_quant (TFL::FakeQuantOp) +FakeQuant operator + +### Description: + +Fake-quantize the 'inputs' tensor of type float via float scalars min and +max to 'outputs' tensor of same shape as inputs. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `minmax` | `ArrayAttr` | min-max range pair attribute | +| `num_bits` | `IntegerAttr` | 32-bit integer attribute attribute | +| `narrow_range` | `BoolAttr` | bool attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.fill (TFL::FillOp) +Fill the tensor with given value. + +### Description: + +Fill the tensor with given value. + +### Operands: +1. `dims`: tensor of 32/64-bit integer values +1. `value`: tensor of any type values + +### Attributes: + +### Results: +1. `res`: tensor of any type values + +## tfl.floor_div (TFL::FloorDivOp) +Floor div operator + +### Description: + +Element-wise floor div operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.floor_mod (TFL::FloorModOp) +Division remainder + +### Description: + +Element-wise division remainder operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.floor (TFL::FloorOp) +Floor operator + +### Description: + +Returns element-wise floor value of the input. + +### Operands: +1. `x`: tensor of floating-point values + +### Attributes: + +### Results: +1. `y`: tensor of floating-point values + +## tfl.fully_connected (TFL::FullyConnectedOp) +Fully connected op + +### Description: + + +### Operands: +1. `input`: tensor of 32-bit float values +1. `filter`: tensor of 32-bit float values +1. `bias`: tensor of 32-bit float values or none type + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | +| `weights_format` | `StringAttr` | fully connected options weights format attribute | +| `keep_num_dims` | `BoolAttr` | bool attribute attribute | + +### Results: +1. `output`: tensor of 32-bit float values + +## tfl.gather (TFL::GatherOp) +Gather operator + +### Description: + +Gather slices from `params` axis `axis` according to `indices`. + +### Operands: +1. `params`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer or TFLite string type values +1.
`indices`: tensor of 32-bit integer or 64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `axis` | `IntegerAttr` | 32-bit integer attribute attribute | + +### Results: +1. `output`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer or TFLite string type values + +## tfl.greater_equal (TFL::GreaterEqualOp) +Greater_equal operator + +### Description: + +Element-wise greater_equal operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of 1-bit integer values + +## tfl.greater (TFL::GreaterOp) +Greater operator + +### Description: + +Element-wise greater operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.pseudo_input (TFL::InputOp) +Input pseudo operator + +### Description: + +Takes one of the function arguments as input and returns it as result. This +is a NOP and is used to attach attributes such as tensor name to function +arguments. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.leaky_relu (TFL::LeakyReluOp) +Leaky Relu operator + +### Description: + +Element-wise Leaky ReLU operator + x -> x >= 0 ? x : (alpha * x) + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `alpha` | `FloatAttr` | 32-bit float attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.less_equal (TFL::LessEqualOp) +Less_equal operator + +### Description: + +Element-wise less_equal operation. + +### Operands: +1. `lhs`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values +1. `rhs`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 1-bit integer values + +## tfl.less (TFL::LessOp) +Less operator + +### Description: + +Element-wise less operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of 1-bit integer values + +## tfl.log (TFL::LogOp) +Natural logarithm operator + +### Description: + +Performs element-wise natural logarithm operation on input. + +### Operands: +1. `x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.log_softmax (TFL::LogSoftmaxOp) +Log softmax operator + +### Description: + +Computes element-wise log softmax activations with the following formula + + input - log(reduce_sum(exp(input), dim)) + +### Operands: +1. `input`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.logical_and (TFL::LogicalAndOp) +Logical AND operator + +### Description: + +Element-wise logical AND operation. + +### Operands: +1. `lhs`: tensor of 1-bit integer values +1. `rhs`: tensor of 1-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 1-bit integer values + +## tfl.logical_not (TFL::LogicalNotOp) +Logical NOT operator + +### Description: + +Element-wise logical NOT operation. + +### Operands: +1. `lhs`: tensor of 1-bit integer values + +### Attributes: + +### Results: +1. 
`output`: tensor of 1-bit integer values + +## tfl.logical_or (TFL::LogicalOrOp) +Logical OR operator + +### Description: + +Element-wise logical OR operation. + +### Operands: +1. `lhs`: tensor of 1-bit integer values +1. `rhs`: tensor of 1-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 1-bit integer values + +## tfl.logistic (TFL::LogisticOp) +Logistic operator + +### Description: + +Computes element-wise Sigmoid of input + +### Operands: +1. `x`: tensor of floating-point values + +### Attributes: + +### Results: +1. `y`: tensor of floating-point values + +## tfl.max_pool_2d (TFL::MaxPool2DOp) +Max Pool 2D op + +### Description: + +Performs max pool 2D on input. + +Inputs: + `inputs[0]`: required: the input tensor + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `padding` | `StringAttr` | padding enum attribute | +| `stride_w` | `IntegerAttr` | 32-bit integer attribute attribute | +| `stride_h` | `IntegerAttr` | 32-bit integer attribute attribute | +| `filter_width` | `IntegerAttr` | 32-bit integer attribute attribute | +| `filter_height` | `IntegerAttr` | 32-bit integer attribute attribute | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.maximum (TFL::MaximumOp) +Max operator + +### Description: + +Element-wise max operation. + +### Operands: +1. `lhs`: tensor of floating-point or 32/64-bit integer values +1. `rhs`: tensor of floating-point or 32/64-bit integer values + +### Attributes: + +### Results: +1. `max`: tensor of floating-point or 32/64-bit integer values + +## tfl.mean (TFL::MeanOp) +Mean operator + +### Description: + +Computes the mean of elements across dimensions of a tensor. +Reduces input_tensor along the dimensions given in axis. +Unless keepdims is true, the rank of the tensor is reduced by 1 for +each entry in axis. If keepdims is true, the reduced dimensions are retained +with length 1. + +### Operands: +1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values +1. `axis`: tensor of 32-bit integer or 64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `keep_dims` | `BoolAttr` | bool attribute attribute | + +### Results: +1. `output`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values + +## tfl.minimum (TFL::MinimumOp) +Min operator + +### Description: + +Element-wise min operation. + +### Operands: +1. `lhs`: tensor of floating-point or 32/64-bit integer values +1. `rhs`: tensor of floating-point or 32/64-bit integer values + +### Attributes: + +### Results: +1. `min`: tensor of floating-point or 32/64-bit integer values + +## tfl.mul (TFL::MulOp) +Multiplication operator + +### Description: + +Element-wise multiplication operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.neg (TFL::NegOp) +Negation operator + +### Description: + +Computes element-wise negation of input + +### Operands: +1. `x`: tensor of any type values + +### Attributes: + +### Results: +1. 
`y`: tensor of any type values + +## tfl.not_equal (TFL::NotEqualOp) +Not_equal operator + +### Description: + +Element-wise not_equal operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of 1-bit integer values + +## tfl.pack (TFL::PackOp) +Packs a list of tensors along a dimension into one tensor + +### Description: + +Packs a list of `values_count` rank-`R` tensors into one rank-`(R+1)` +tensor. + +Packs the `values_count` tensors in `values` into a tensor with rank one +higher than each tensor in `values`, by packing them along the `axis` +dimension. + +Given a list of tensors of shape `(A, B, C)`; + +if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`. +if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`. +Etc. + +For example: + +``` +# 'x' is [1, 4] +# 'y' is [2, 5] +# 'z' is [3, 6] +pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]] # Pack along first dim. +pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]] +``` + +This is the opposite of `unpack`. + +### Operands: +1. `values`: tensor of 32-bit float or 8-bit integer or 16-bit integer or 32-bit integer or 64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `values_count` | `IntegerAttr` | 32-bit integer attribute attribute | +| `axis` | `IntegerAttr` | 32-bit integer attribute attribute | + +### Results: +1. `output`: tensor of 32-bit float or 8-bit integer or 16-bit integer or 32-bit integer or 64-bit integer values + +## tfl.pad (TFL::PadOp) +Padding operator + +### Description: + +This operation pads a `input` with zeros according to the `paddings` you +specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is +the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` +indicates how many zeros to add before the contents of `input` in that +dimension, and `paddings[D, 1]` indicates how many zeros to add after the +contents of `input` in that dimension. + +The padded size of each dimension D of the output is: + + `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` + +For example: + +``` +# 't' is [[1, 1], [2, 2]] +# 'paddings' is [[1, 1], [2, 2]] +# rank of 't' is 2 +pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] + [0, 0, 1, 1, 0, 0] + [0, 0, 2, 2, 0, 0] + [0, 0, 0, 0, 0, 0]] + +### Operands: +1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values +1. `padding`: tensor of 32/64-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values + +## tfl.padv2 (TFL::PadV2Op) +Padding operator v2 + +### Description: + +This operation pads a `input` according to the `paddings` and +`constant_values` you specify. `paddings` is an integer tensor with shape +`[Dn, 2]`, where n is the rank of `input`. For each dimension D of `input`, +`paddings[D, 0]` indicates how many zeros to add before the contents of +`input` in that dimension, and `paddings[D, 1]` indicates how many zeros to +add after the contents of `input` in that dimension. `constant_values` is a +scalar tensor of the same type as `input` that indicates the value to use +for padding `input`. 
+ +The padded size of each dimension D of the output is: + + `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` + +For example: + +``` +# 't' is [[1, 1], [2, 2]] +# 'paddings' is [[1, 1], [2, 2]] +# rank of 't' is 2 +pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] + [0, 0, 1, 1, 0, 0] + [0, 0, 2, 2, 0, 0] + [0, 0, 0, 0, 0, 0]] + +### Operands: +1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values +1. `padding`: tensor of 32/64-bit integer values +1. `constant_values`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values + +## tfl.pow (TFL::PowOp) +Power operator + +### Description: + +Element-wise power operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.pseudo_qconst (TFL::QConstOp) +Quantized constant pseudo op + +### Description: + +Represents a quantized constant value in TensorFlow Lite dialect. This is +not an actual operation and it will be lowered to buffer instead. The +quantization parameters are stored as a type attribute in this constant. + +### Operands: + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `qtype` | `TypeAttr` | Tensor type attribute attribute | +| `value` | `ElementsAttr` | constant vector/tensor attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.quantize (TFL::QuantizeOp) +Quantize operator + +### Description: + +Converts floating point tensors to quantized integer tensors according to +the quantization parameters defined in the type attribute. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `qtype` | `TypeAttr` | Tensor type attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.range (TFL::RangeOp) +Range operator + +### Description: + +Returns a 1D tensor defined by a sequence from `start` to `limit` with +a given `delta`. + +### Operands: +1. `start`: tensor of any type values +1. `limit`: tensor of any type values +1. `delta`: tensor of any type values + +### Attributes: + +### Results: +1. `result`: tensor of any type values + +## tfl.rank (TFL::RankOp) +Rank operator. + +### Description: + +Returns the rank of a tensor. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any integer type + +## tfl.reduce_max (TFL::ReduceMaxOp) +Max-reduction operator + +### Description: + +Computes the max reduction along the specified axes + +### Operands: +1. `input`: tensor of any type values +1. `axes`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `keep_dims` | `BoolAttr` | bool attribute attribute | + +### Results: +1. «unnamed»: tensor of any type values + +## tfl.reduce_min (TFL::ReduceMinOp) +Min-reduction operator + +### Description: + +Computes the min reduction along the specified axes + +### Operands: +1. `input`: tensor of any type values +1. 
`axes`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `keep_dims` | `BoolAttr` | bool attribute attribute | + +### Results: +1. «unnamed»: tensor of any type values + +## tfl.relu6 (TFL::Relu6Op) +Relu6 operator + +### Description: + +Element-wise Relu6 operator + x -> max(0, min(6, x)) + +### Operands: +1. `x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.relu (TFL::ReluOp) +Relu operator + +### Description: + +Element-wise Relu operator + x -> max(0, x) + +### Operands: +1. `x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.reshape (TFL::ReshapeOp) +Reshape operator + +### Description: + +Produces a tensor with the same values but different static shape defined +by the output type. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `new_shape` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.resize_bilinear (TFL::ResizeBilinearOp) +ResizeBilinear Op + +### Description: + +Resize `images` to `size` using bilinear interpolation. + +### Operands: +1. `input`: tensor of 32-bit float or 32-bit integer values +1. `size`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `align_corners` | `BoolAttr` | bool attribute attribute | + +### Results: +1. `output`: tensor of 32-bit float values + +## tfl.reverse_v2 (TFL::ReverseV2Op) +ReverseV2 Operator + +### Description: + +Reverses specific dimensions of a tensor. + +Given a tensor, and a int32/int64 tensor axis representing the set +of dimensions of tensor to reverse. +This operation reverses each dimension i for +which there exists j s.t. axis[j] == i. + +Args: + tensor: A Tensor. Must be one of the following types: + int16, int32, int64, float32 Up to 8-D. + + axis: A Tensor. Must be one of the following types: int32, int64. + with only 1 element which is the axis index. + TODO: Add support for multiple elements. + +### Operands: +1. `input`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values +1. `axis`: tensor of 32-bit integer or 64-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer or 8-bit integer values + +## tfl.rsqrt (TFL::RsqrtOp) +Reciprocal of square root operator + +### Description: + +Computes element-wise reverse square root of input + +### Operands: +1. `x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.select (TFL::SelectOp) +Select operator + +### Description: + +Select values of 'x' if the corresponding value of 'condition' is true or +the value of 'y' if false. There are valid condition input sizes: + +1. Either the same shape (in which case the select is elementwise), or +2. condition must be Rank 1 and match over the first dimension. + +### Operands: +1. `condition`: tensor of 1-bit integer values +1. `x`: tensor of 32-bit float or 1-bit integer or 8-bit integer or 16-bit integer or 32-bit integer or 64-bit integer values +1. 
`y`: tensor of 32-bit float or 1-bit integer or 8-bit integer or 16-bit integer or 32-bit integer or 64-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.shape (TFL::ShapeOp) +Shape operator + +### Description: + +Returns the shape of a tensor. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `out_type` | `Attribute` | derived attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.sin (TFL::SinOp) +Sine operator + +### Description: + +Computes element-wise Sine of input + +### Operands: +1. `x`: tensor of floating-point values + +### Attributes: + +### Results: +1. `y`: tensor of floating-point values + +## tfl.softmax (TFL::SoftmaxOp) +Softmax operator + +### Description: + +Computes element-wise softmax activiations with the following formula + + exp(input) / tf.reduce_sum(exp(input * beta), dim) + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `beta` | `FloatAttr` | 32-bit float attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.space_to_batch_nd (TFL::SpaceToBatchNdOp) +SpaceToBatchNd operator + +### Description: + +This operation reshapes space dimensions into the "batch" dimension 0 + +### Operands: +1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values +1. `block_shape`: tensor of 32-bit integer values +1. `paddings`: tensor of 32-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values + +## tfl.split (TFL::SplitOp) +Splits a tensor into `num_split` tensors along one dimension. + +### Description: + +Splits the `value` tensor along `split_dim` into a number of sub-tensors +with same shape as the original one, except for `split_dim`. Same as +tf.Split. + +### Operands: +1. `split_dim`: tensor of 32-bit integer values +1. `value`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `num_splits` | `IntegerAttr` | 32-bit integer attribute attribute | + +### Results: +1. `outputs`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values + +## tfl.split_v (TFL::SplitVOp) +Splits a tensor into `num_split` tensors along one dimension. + +### Description: + +Splits the `value` tensor along `split_dim` into a number of sub-tensors +with same shape as the original one, except for `split_dim`. The grouping +of the resultant sub-tensors is decided by `size-splits`. Same as tf.SplitV. + +### Operands: +1. `value`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values +1. `size_splits`: tensor of 32-bit integer values +1. `split_dim`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `num_splits` | `IntegerAttr` | 32-bit integer attribute attribute | + +### Results: +1. `outputs`: tensor of 32-bit float or 16-bit integer or 32-bit integer or 64-bit integer values + +## tfl.sqrt (TFL::SqrtOp) +Square root operator + +### Description: + +Computes element-wise Square root of input + +### Operands: +1. 
`x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.square (TFL::SquareOp) +Square operator + +### Description: + +Computes element-wise Square of input + +### Operands: +1. `x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.squared_difference (TFL::SquaredDifferenceOp) +Squared difference operator + +### Description: + +Element-wise squared difference operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.squeeze (TFL::SqueezeOp) +Removes dimensions of size 1 from the shape of a tensor. + +### Description: + +Given a tensor `input`, this operation returns a tensor of the same type with +all dimensions of size 1 removed. If you don't want to remove all size 1 +dimensions, you can remove specific size 1 dimensions by specifying +`axis`. + +For example: + +``` +# 't' is a tensor of shape [1, 2, 1, 3, 1, 1] +shape(squeeze(t)) ==> [2, 3] +``` + +Or, to remove specific size 1 dimensions: + +``` +# 't' is a tensor of shape [1, 2, 1, 3, 1, 1] +shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] +``` + +### Operands: +1. `input`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `squeeze_dims` | `ArrayAttr` | 64-bit integer array attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.strided_slice (TFL::StridedSliceOp) +StridedSlice Op + +### Description: + +Return a strided slice from `input`. + +### Operands: +1. `input`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values +1. `begin`: tensor of 32-bit integer values +1. `end`: tensor of 32-bit integer values +1. `strides`: tensor of 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `begin_mask` | `IntegerAttr` | 32-bit integer attribute attribute | +| `end_mask` | `IntegerAttr` | 32-bit integer attribute attribute | +| `ellipsis_mask` | `IntegerAttr` | 32-bit integer attribute attribute | +| `new_axis_mask` | `IntegerAttr` | 32-bit integer attribute attribute | +| `shrink_axis_mask` | `IntegerAttr` | 32-bit integer attribute attribute | + +### Results: +1. `output`: tensor of 32-bit float or 32-bit integer or 64-bit integer or 8-bit integer values + +## tfl.sub (TFL::SubOp) +Subtraction operator + +### Description: + +Element-wise subtraction operation. + +### Operands: +1. `lhs`: tensor of any type values +1. `rhs`: tensor of any type values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.sum (TFL::SumOp) +Sum operator + +### Description: + +Computes the sum reduction along the specified axes + +### Operands: +1. `input`: tensor of any type values +1. `axes`: tensor of 32/64-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `keep_dims` | `BoolAttr` | bool attribute attribute | + +### Results: +1. «unnamed»: tensor of any type values + +## tfl.tanh (TFL::TanhOp) +Hyperbolic tangent operator + +### Description: + +Computes element-wise Hyperbolic tangent of input + +### Operands: +1. 
`x`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.tile (TFL::TileOp) +Tile operator. + +### Description: + + Constructs a tensor by tiling a given tensor. + +This operation creates a new tensor by replicating input +multiples times. The output tensor's i'th dimension has +input.dims(i) * multiples[i] elements, and the values of input +are replicated multiples[i] times along the 'i'th dimension. +For example, tiling [a b c d] by [2] produces [a b c d a b c d]. + +### Operands: +1. `input`: tensor of any type values +1. `multiples`: tensor of 32/64-bit integer values + +### Attributes: + +### Results: +1. `output`: tensor of any type values + +## tfl.topk_v2 (TFL::TopKV2Op) +TopK operator + +### Description: + +Returns the top `k` largest element along each last dimensional slice of +`input` and the indices of values within the last dimension of the input +tensor. + +### Operands: +1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer or 64-bit integer values +1. `k`: tensor of 32-bit integer values + +### Attributes: + +### Results: +1. `values`: tensor of any type values +1. `indices`: tensor of 32-bit integer values + +## tfl.transpose (TFL::TransposeOp) +Transpose operator + +### Description: + +Returns the Transpose of x + +### Operands: +1. `x`: tensor of any type values +1. `perm`: tensor of any type values + +### Attributes: + +### Results: +1. `y`: tensor of any type values + +## tfl.unidirectional_sequence_lstm (TFL::UnidirectionalSequenceLSTMOp) +Unidirectional sequence lstm operator + +### Description: + +A recurrent neural network specified by an LSTM cell. This Op supports +unrolling the input along the time or batch dimensions, and +implements the following operation for +each element in the sequence s = 1...sequence_length: + outputs[s] = state = activation(LSTMOp(inputs[s])) + +where LSTMOp is LSTM TF Lite Op and the “activation” is the function passed +as the “fused_activation_function” argument (if not “NONE”). + +### Operands: +1. `input`: tensor of 32-bit float or 8-bit integer values +1. `input_to_input_weights`: tensor of 32-bit float or 8-bit integer values or none type +1. `input_to_forget_weights`: tensor of 32-bit float or 8-bit integer values +1. `input_to_cell_weights`: tensor of 32-bit float or 8-bit integer values +1. `input_to_output_weights`: tensor of 32-bit float or 8-bit integer values +1. `recurrent_to_input_weights`: tensor of 32-bit float or 8-bit integer values or none type +1. `recurrent_to_forget_weights`: tensor of 32-bit float or 8-bit integer values +1. `recurrent_to_cell_weights`: tensor of 32-bit float or 8-bit integer values +1. `recurrent_to_output_weights`: tensor of 32-bit float or 8-bit integer values +1. `cell_to_input_weights`: tensor of 32-bit float or 8-bit integer values or none type +1. `cell_to_forget_weights`: tensor of 32-bit float or 8-bit integer values or none type +1. `cell_to_output_weights`: tensor of 32-bit float or 8-bit integer values or none type +1. `input_gate_bias`: tensor of 32-bit float values or none type +1. `forget_gate_bias`: tensor of 32-bit float values +1. `cell_bias`: tensor of 32-bit float values +1. `output_gate_bias`: tensor of 32-bit float values +1. `projection_weights`: tensor of 32-bit float or 8-bit integer values or none type +1. `projection_bias`: tensor of 32-bit float values or none type +1. `input_activation_state`: stateful tensor +1. `input_cell_state`: stateful tensor +1. 
`input_layer_norm_coefficients`: tensor of 32-bit float or 8-bit integer values or none type +1. `forget_layer_norm_coefficients`: tensor of 32-bit float or 8-bit integer values or none type +1. `cell_layer_norm_coefficients`: tensor of 32-bit float or 8-bit integer values or none type +1. `output_layer_norm_coefficients`: tensor of 32-bit float or 8-bit integer values or none type + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `fused_activation_function` | `StringAttr` | fused activation enum attribute | +| `cell_clip` | `FloatAttr` | 32-bit float attribute attribute | +| `proj_clip` | `FloatAttr` | 32-bit float attribute attribute | +| `time_major` | `BoolAttr` | bool attribute attribute | + +### Results: +1. `output`: tensor of any type values + +## tfl.unpack (TFL::UnpackOp) +Unpacks a tensor along a dimension into multiple tensors + +### Description: + +Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors. + +Unpacks `num` tensors from `value` by chipping it along the `axis` dimension. +For example, given a tensor of shape `(A, B, C, D)`; + +If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]` + and each tensor in `output` will have shape `(B, C, D)`. (Note that the + dimension unpacked along is gone, unlike `split`). + +If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]` + and each tensor in `output` will have shape `(A, C, D)`. +Etc. + +This is the opposite of `pack`. + +### Operands: +1. `input`: tensor of 32-bit float or 8-bit integer or 32-bit integer values + +### Attributes: +| Attribute | MLIR Type | Description | +| :-------: | :-------: | ----------- | +| `num` | `IntegerAttr` | 32-bit integer attribute attribute | +| `axis` | `IntegerAttr` | 32-bit integer attribute attribute | + +### Results: +1. `outputs`: tensor of 32-bit float or 8-bit integer or 32-bit integer values + +## tfl.zeros_like (TFL::ZerosLikeOp) +ZerosLike operator + +### Description: + +Returns a tensor of zeros with the same shape and type as the input tensor. + +### Operands: +1. `input`: tensor of any type values + +### Attributes: + +### Results: +1. 
`output`: tensor of any type values + diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 99740515a48..c4a3275d557 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -1,4 +1,4 @@ -load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_native_cc_binary") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_native_cc_binary") load( "@local_config_mlir//:tblgen.bzl", "gentbl", @@ -185,6 +185,41 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "lstm_utils", + srcs = [ + "utils/lstm_utils.cc", + ], + hdrs = [ + "utils/lstm_utils.h", + ], + copts = ["-std=c++14"], + deps = [ + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm//:support", + "@local_config_mlir//:IR", + "@local_config_mlir//:StandardOps", + "@local_config_mlir//:Support", + ], +) + +tf_cc_test( + name = "lstm_utils_test", + size = "small", + srcs = ["utils/lstm_utils_test.cc"], + deps = [ + ":lstm_utils", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@llvm//:support", + "@local_config_mlir//:IR", + "@local_config_mlir//:StandardOps", + "@local_config_mlir//:Support", + ], +) + cc_library( name = "tensorflow_lite_legalize_tf", srcs = [ @@ -198,9 +233,11 @@ cc_library( "transforms/prepare_composite_functions_tf.cc", "transforms/prepare_tf.cc", "transforms/trim_functions_tf.cc", + "transforms/unroll_batch_matmul.cc", ], hdrs = [ "transforms/passes.h", + "transforms/unroll_batch_matmul.h", ], deps = [ ":common", @@ -249,6 +286,7 @@ cc_library( name = "tensorflow_lite_quantize", srcs = [ "transforms/generated_quantize.inc", + "transforms/load_quantization_recipe.cc", "transforms/post_quantize.cc", "transforms/prepare_quantize.cc", "transforms/quantize.cc", @@ -521,7 +559,7 @@ cc_library( ":tensorflow_lite_quantize", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_fold_switch", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow:translate_lib", diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index aa57ff7f751..783696ecac3 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -99,7 +99,10 @@ using xla::StatusOr; template using BufferOffset = flatbuffers::Offset; -using CustomOptionsOffset = BufferOffset>; +template +using VectorBufferOffset = flatbuffers::Offset>; + +using CustomOptionsOffset = VectorBufferOffset; namespace error = tensorflow::error; namespace tfl = mlir::TFL; @@ -415,6 +418,15 @@ class Translator { Optional> BuildSubGraph(FuncOp fn); + // Builds Metadata with the given `name` and buffer `content`. + BufferOffset BuildMetadata(StringRef name, + StringRef content); + + // Encodes the `tfl.metadata` dictionary attribute of the module to the + // metadata section in the final model. + Optional>> + CreateMetadataVector(); + // Uses the tf.entry_function attribute (if set) to initialize the op to name // mapping. 
void InitializeNamesFromAttribute(FuncOp fn); @@ -977,6 +989,36 @@ Optional> Translator::BuildSubGraph(FuncOp fn) { /*name=*/builder_.CreateString(fn.getName().str())); } +BufferOffset Translator::BuildMetadata(StringRef name, + StringRef content) { + auto buffer_index = buffers_.size(); + auto buffer_data = builder_.CreateVector( + reinterpret_cast(content.data()), content.size()); + buffers_.push_back(tflite::CreateBuffer(builder_, buffer_data)); + return tflite::CreateMetadataDirect(builder_, name.data(), buffer_index); +} + +Optional>> +Translator::CreateMetadataVector() { + auto dict_attr = module_.getAttrOfType("tfl.metadata"); + if (!dict_attr) return VectorBufferOffset>(); + + std::vector> metadata; + for (const auto& named_attr : dict_attr) { + StringRef name = named_attr.first; + mlir::Attribute attr = named_attr.second; + if (auto content = attr.dyn_cast()) { + metadata.push_back(BuildMetadata(name, content.getValue())); + } else { + module_.emitError( + "all values in tfl.metadata's dictionary key-value pairs should be " + "string attributes"); + return llvm::None; + } + } + return builder_.CreateVector(metadata); +} + Optional Translator::Translate(ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, @@ -1024,12 +1066,17 @@ Optional Translator::TranslateInternal() { } else { model_description = "MLIR Converted."; } + // Build the model and finish the model building process. auto description = builder_.CreateString(model_description.data()); + VectorBufferOffset metadata_buffer = 0; // Deprecated + auto metadata = CreateMetadataVector(); + if (!metadata) return llvm::None; + auto model = tflite::CreateModel( builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(opcodes_), builder_.CreateVector(subgraphs), description, - builder_.CreateVector(buffers_)); + builder_.CreateVector(buffers_), metadata_buffer, *metadata); tflite::FinishModelBuffer(builder_, model); // Return serialized string for the built FlatBuffer. diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index e92b7ac60a3..4f3d71a7fd4 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include #include #include "llvm/ADT/APFloat.h" @@ -30,6 +31,7 @@ limitations under the License. 
#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { @@ -1167,6 +1169,54 @@ OpFoldResult TransposeOp::fold(ArrayRef operands) { return DenseElementsAttr::get(result_type, new_values); } +static LogicalResult Verify(TransposeOp op) { + auto input_type = op.x()->getType().cast(); + auto perm_type = op.perm()->getType().cast(); + auto output_type = op.y()->getType().cast(); + if (input_type.hasStaticShape() && perm_type.hasStaticShape()) { + if (perm_type.getNumElements() != input_type.getRank()) { + return op.emitOpError( + "perm tensor elements size is not equal to input tensor rank"); + } + } + + DenseIntElementsAttr perm; + if (!matchPattern(op.perm(), m_Constant(&perm))) { + return success(); + } + + int index = 0; + llvm::SmallVector axes; + for (auto axis_int : perm.getValues()) { + const int64_t axis = axis_int.getSExtValue(); + if (axis < 0 || (input_type.hasRank() && axis >= input_type.getRank())) { + return op.emitOpError( + llvm::formatv("perm[{0}] must be in [0, rank)", index)); + } + if (std::count(axes.begin(), axes.end(), axis) > 0) { + return op.emitOpError( + llvm::formatv("perm[{0}] cannot have duplicated axis", index)); + } + axes.push_back(axis); + index++; + } + + if (input_type.hasStaticShape() && output_type.hasStaticShape()) { + llvm::SmallVector transposed_shape; + for (int64_t axis : axes) { + transposed_shape.push_back(input_type.getDimSize(axis)); + } + auto expected_output_type = + RankedTensorType::get(transposed_shape, input_type.getElementType()); + if (output_type != expected_output_type) { + return op.emitOpError(llvm::formatv("expect output type {0}, got {1}", + expected_output_type, output_type)); + } + } + + return success(); +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 1d7b909f762..018e6605197 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -132,15 +132,35 @@ def TFL_FpOrI32OrI64Tensor : TensorOf<[AnyFloat, TFL_Int32Or64]>; // Rank/Shape helpers. //===----------------------------------------------------------------------===// +class TFL_OperandIsUnrankedPred : + CPred<"$_op.getOperand(" # n # ")->getType().isa()">; + // TODO: Some of these could be generalized and/or moved to more general // location. // Returns true if the n-th operand has unknown rank or has rank m. class TFL_OperandHasRank : PredOpTrait<"operand " # n # " is " # m # "-D", - Or<[CPred<"$_op.getOperand(" # n # ")->getType().isa()">, + Or<[TFL_OperandIsUnrankedPred, CPred<"$_op.getOperand(" # n # ")->getType().cast().getRank() == " # m>]>>; +// CPred version of TFL_OperandHasRank. +class TFL_OperandHasRankPred : + Or<[TFL_OperandIsUnrankedPred, + CPred<"$_op.getOperand(" # n # + ")->getType().cast().getRank() == " # m>]>; + +// True if operand n is ranked and has a rank > dim. 
+class TFL_OperandIsRankedAndHasDimPred : And<[ + CPred<"$_op.getOperand(" # n # ")->getType().isa()">, + CPred<"$_op.getOperand(" # n # ")->getType().cast().getRank() > " + # dim>]>; + +class TFL_OperandDimEquals : And<[ + TFL_OperandIsRankedAndHasDimPred, + CPred<"$_op.getOperand(" # n # ")->getType().cast()" + ".getShape()[" # dim # " ] == " # size>]>; + // Returns true if the n-th operand has unknown rank or at least rank m. class TFL_OperandHasAtleastRank : PredOpTrait<"operand " # n # " is " # m # "-D", @@ -155,6 +175,32 @@ class TFL_OperandRankEquals1DimOfOperand : "$_op.getOperand(" # y # ")->getType().cast().getShape()[0]">>; +// True if x_shape[dim] == y_shape[dim]. +class TFL_DimOfOperandEqualsDimOfOperandPred : And<[ + TFL_OperandIsRankedAndHasDimPred, + TFL_OperandIsRankedAndHasDimPred, + CPred<"$_op.getOperand(" # x # + ")->getType().cast().getShape()[" # dim # "] == " + "$_op.getOperand(" # y # + ")->getType().cast().getShape()[" # dim # "]">]>; + +// Select operands must satisfy one of the following constraints: +// All inputs are unranked/scalars +// OR +// All inputs are ranked AND have equal dim[0] AND X & Y have same rank. +def SelectShapeConstraints : + PredOpTrait<"Select operands meet shape criteria", + Or<[ + And<[ + TFL_OperandHasRankPred<0, 0>, + TFL_OperandHasRankPred<1, 0>, + TFL_OperandHasRankPred<2, 0>]>, + And<[ + TFL_DimOfOperandEqualsDimOfOperandPred<0, 1, 0>, + TFL_DimOfOperandEqualsDimOfOperandPred<0, 2, 0>, + CPred<"$_op.getOperand(1)->getType().cast().getRank() == " + "$_op.getOperand(2)->getType().cast().getRank()">]>]>>; + // This is a quantization-aware version of TCresVTEtIsSameAsOp class TFL_TCresVTEtIsSameAsOp : And<[ TCOpResIsShapedTypePred, @@ -315,7 +361,7 @@ def TFL_AddOp : TFL_Op<"add", [Broadcastable, NoSideEffect, Commutative]> { // TODO(haoliang): Implement legalization pass after pattern rewrite generator // supports variadic inputs. -def TFL_AddNOp : TFL_Op<"add_n", [Commutative, NoSideEffect]> { +def TFL_AddNOp : TFL_Op<"add_n", [Commutative, NoSideEffect, SameOperandsAndResultsScale]> { let summary = "add_n operator"; let description = [{ @@ -323,11 +369,11 @@ def TFL_AddNOp : TFL_Op<"add_n", [Commutative, NoSideEffect]> { }]; let arguments = (ins - Variadic>:$inputs + Variadic>:$inputs ); let results = (outs - TensorOf<[F32, I32]>:$sum + TensorOf<[F32, I32, QI16, QUI16]>:$sum ); } @@ -680,6 +726,117 @@ def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [ let hasOptions = 0; } +// These ops are named NonMaxSuppressionV4 & NonMaxSuppressionV5 to be +// consistent with TensorFlow's naming. They are NOT 'versions' of NMS in the +// sense that one is an incremental change over the other. +// In reality NonMaxSuppressionV5 implements Soft Non Max Suppression and +// NonMaxSuppressionV4 performs hard NMS. + +def TFL_NonMaxSuppressionV4Op : TFL_Op<"non_max_suppression_v4", [ + NoSideEffect, + // Operand 0 (boxes) should have rank 2 with the dim[1] == 4 (box corners) + TFL_OperandHasRank<0, 2>, + PredOpTrait<"boxes should have dim[1] == 4", + TFL_OperandDimEquals<0, 1, 4>>, + // Operand 1 (scores) should be a 1-dim tensor + TFL_OperandHasRank<1, 1>, + // Other operands are scalar params. + TFL_OperandHasRank<2, 0>, TFL_OperandHasRank<3, 0>, + TFL_OperandHasRank<4, 0>]> { + let summary = [{ +Greedily selects a subset of bounding boxes in descending order of score, + }]; + + let description = [{ +pruning away boxes that have high intersection-over-union (IOU) overlap +with previously selected boxes. 
Bounding boxes with score less than +`score_threshold` are removed. Bounding boxes are supplied as +[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any +diagonal pair of box corners and the coordinates can be provided as normalized +(i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm +is agnostic to where the origin is in the coordinate system and more +generally is invariant to orthogonal transformations and translations +of the coordinate system; thus translating or reflections of the coordinate +system result in the same boxes being selected by the algorithm. +The output of this operation is a set of integers indexing into the input +collection of bounding boxes representing the selected boxes. The bounding +box coordinates corresponding to the selected indices can then be obtained +using the `tf.gather operation`. For example: + selected_indices = tf.image.non_max_suppression_v2( + boxes, scores, max_output_size, iou_threshold, score_threshold) + selected_boxes = tf.gather(boxes, selected_indices) + }]; + + let arguments = (ins + TFL_FpTensor:$boxes, + TFL_FpTensor:$scores, + I32Tensor:$max_output_size, + TFL_FpTensor:$iou_threshold, + TFL_FpTensor:$score_threshold + ); + + let results = (outs + I32Tensor:$selected_indices, + I32Tensor:$valid_outputs + ); +} + +def TFL_NonMaxSuppressionV5Op : TFL_Op<"non_max_suppression_v5", [ + NoSideEffect, + // Operand 0 (boxes) should have rank 2 with the dim[1] == 4 (box corners) + TFL_OperandHasRank<0, 2>, + PredOpTrait<"boxes should have dim[1] == 4", + TFL_OperandDimEquals<0, 1, 4>>, + // Operand 1 (scores) should be a 1-dim tensor + TFL_OperandHasRank<1, 1>, + // Other operands are scalar params. + TFL_OperandHasRank<2, 0>, TFL_OperandHasRank<3, 0>, + TFL_OperandHasRank<4, 0>, TFL_OperandHasRank<5, 0>]> { + let summary = [{ +Greedily selects a subset of bounding boxes in descending order of score, + }]; + + let description = [{ +pruning away boxes that have high intersection-over-union (IOU) overlap +with previously selected boxes. Bounding boxes with score less than +`score_threshold` are removed. Bounding boxes are supplied as +[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any +diagonal pair of box corners and the coordinates can be provided as normalized +(i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm +is agnostic to where the origin is in the coordinate system and more +generally is invariant to orthogonal transformations and translations +of the coordinate system; thus translating or reflections of the coordinate +system result in the same boxes being selected by the algorithm. +The output of this operation is a set of integers indexing into the input +collection of bounding boxes representing the selected boxes. The bounding +box coordinates corresponding to the selected indices can then be obtained +using the `tf.gather operation`. For example: + selected_indices = tf.image.non_max_suppression_v2( + boxes, scores, max_output_size, iou_threshold, score_threshold) + selected_boxes = tf.gather(boxes, selected_indices) +This op also supports a Soft-NMS (with Gaussian weighting) mode (c.f. +Bodla et al, https://arxiv.org/abs/1704.04503) where boxes reduce the score +of other overlapping boxes instead of directly causing them to be pruned. +To enable this Soft-NMS mode, set the `soft_nms_sigma` parameter to be +larger than 0. 
+ }]; + + let arguments = (ins + TFL_FpTensor:$boxes, + TFL_FpTensor:$scores, + I32Tensor:$max_output_size, + TFL_FpTensor:$iou_threshold, + TFL_FpTensor:$score_threshold, + TFL_FpTensor:$soft_nms_sigma + ); + + let results = (outs + I32Tensor:$selected_indices, + TFL_FpTensor:$selected_scores, + I32Tensor:$valid_outputs + ); +} + def TFL_NotEqualOp : TFL_Op<"not_equal", [ Broadcastable, Commutative, NoSideEffect, NoQuantizableResult]> { let summary = "Not_equal operator"; @@ -987,11 +1144,11 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[F32, QUI8, QI8, I8]>:$input, + TensorOf<[F32, QUI8, QI8, QUI16, QI16, I8]>:$input, TFL_AFAttr:$fused_activation_function ); - let results = (outs TensorOf<[F32, QUI8, QI8, I8]>:$output); + let results = (outs TensorOf<[F32, QUI8, QI8, QUI16, QI16, I8]>:$output); let hasOptions = 1; @@ -1100,9 +1257,9 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ Computes element-wise Sigmoid of input }]; - let arguments = (ins TensorOf<[AnyFloat, QI8, QUI8]>:$x); + let arguments = (ins TensorOf<[AnyFloat, QI8, QUI8, QI16, QUI16]>:$x); - let results = (outs TensorOf<[AnyFloat, QI8, QUI8]>:$y); + let results = (outs TensorOf<[AnyFloat, QI8, QUI8, QI16, QUI16]>:$y); } def TFL_LogOp: TFL_Op<"log", [NoSideEffect, SameOperandsAndResultType]> { @@ -1441,7 +1598,7 @@ def TFL_NegOp: TFL_Op<"neg", [NoSideEffect, SameOperandsAndResultType]> { let hasOptions = 0b1; } -def TFL_PackOp : TFL_Op<"pack", [NoSideEffect]> { +def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { let summary = "Packs a list of tensors along a dimension into one tensor"; let description = [{ @@ -1472,14 +1629,14 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect]> { }]; let arguments = (ins - Variadic>:$values, + Variadic>:$values, I32Attr:$values_count, I32Attr:$axis ); let results = (outs - TensorOf<[F32, I8, I16, I32, I64]>:$output + TensorOf<[F32, I8, I16, I32, I64, QI8, QUI8]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -1777,8 +1934,7 @@ def TFL_ReverseV2Op: TFL_Op<"reverse_v2", } def TFL_SelectOp : TFL_Op<"select", [NoSideEffect, - // TODO(jpienaar): This is too retrictive, rank 1 input is also allowed. - SameOperandsAndResultShape, + SelectShapeConstraints, PredOpTrait<"operands have same element type", TCopVTEtIsSameAs<1, 2>>, PredOpTrait<"operands and result have same element type", TCresVTEtIsSameAsOp<0, 1>>]> { @@ -1836,7 +1992,7 @@ def TFL_SoftmaxOp : TFL_Op<"softmax", [ let summary = "Softmax operator"; let description = [{ - Computes element-wise softmax activiations with the following formula + Computes element-wise softmax activations with the following formula exp(input) / tf.reduce_sum(exp(input * beta), dim) }]; @@ -1942,9 +2098,9 @@ def TFL_TanhOp: TFL_Op<"tanh", [ Computes element-wise Hyperbolic tangent of input }]; - let arguments = (ins TensorOf<[F32, I16, I8, QI8, QUI8, TFL_Uint8]>:$x); + let arguments = (ins TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$x); - let results = (outs TensorOf<[F32, I16, I8, QI8, QUI8, TFL_Uint8]>:$y); + let results = (outs TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$y); } def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, @@ -1999,8 +2155,6 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, let hasOptions = 1; } -// TODO: Verify result shape a permutation of the first input shape's -// dimensions. 
def TFL_TransposeOp : TFL_Op<"transpose", [NoSideEffect, TFL_OperandHasRank<1,1>, @@ -2025,6 +2179,8 @@ def TFL_TransposeOp : TFL_Op<"transpose", AnyTensor:$y ); + let verifier = [{ return Verify(*this); }]; + let hasFolder = 1; } @@ -2342,7 +2498,8 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", let hasOptions = 1; } -def TFL_CastOp : TFL_Op<"cast", [NoSideEffect, SameOperandsAndResultShape]> { +def TFL_CastOp : TFL_Op<"cast", [ + NoSideEffect, SameOperandsAndResultShape, NoQuantizableResult]> { let summary = "Cast operator"; let description = [{ @@ -2629,6 +2786,10 @@ Ba et al. “Layer Normalization” let results = (outs AnyTensor:$output); + // TODO(fengliuai): customize printer and parser to not display + // empty region. + let regions = (region AnyRegion:$internal); + let hasOptions = 1; let verifier = [{ return Verify(*this); }]; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index e4158700713..4e3fda7771e 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -549,32 +549,42 @@ QuantParams QuantizationDriver::GetQuantParamsForSameScaleConstraint( void QuantizationDriver::PreprocessConstantOps() { fn_.walk([&](ConstantOp cst) { - // Non-float tensors are neither weights or require quantization. - if (!cst.getType().cast().getElementType().isa()) { - return; - } + // Non-float tensors are neither weights nor require quantization. + auto type = cst.getType().dyn_cast(); + if (!type || !type.getElementType().isa()) return; Value *value = cst.getResult(); SmallVector, 4> bias_users; + bool used_as_weight = false; for (auto &use : value->getUses()) { auto spec = GetQuantSpec(use.getOwner()); auto biases = spec->biases_params; Operation *user = use.getOwner(); int operand_num = use.getOperandNumber(); - // The user doesn't use this value as a bias operand nor require same - // scale. + // The user doesn't use this value as a bias operand or require same + // scale, then this constant is considered to be a weight. if (biases.find(operand_num) == biases.end() && !spec->requires_same_scale) { - weights_.insert(cst); + used_as_weight = true; } else { bias_users.push_back({user, operand_num}); } } - builder_.setInsertionPoint(cst); - for (int i = 1; i < bias_users.size(); ++i) { + + // If the constant is used as a weight, this constant will be duplicated for + // each bias user, so it isn't shared with the weight usage. Otherwise, the + // first bias user can use the original constant and the rest use the + // duplications, so we pop bias user from the set. 
+ if (used_as_weight) { + weights_.insert(cst); + } else { + bias_users.pop_back(); + builder_.setInsertionPoint(cst); + } + for (auto bias_user : bias_users) { auto copied = builder_.create(cst.getLoc(), cst.getValue()); - bias_users[i].first->setOperand(bias_users[i].second, copied.getResult()); + bias_user.first->setOperand(bias_user.second, copied.getResult()); } }); } diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index 4919fbc74fe..c6355e123f1 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -32,7 +32,7 @@ static Type GetQuantizedType(Builder builder, Type input_type, double min, double max, int storage_type_width, bool narrow_range, bool is_signed) { auto converter = - quant::ExpressedToUniformQuantizedConverter::forInputType(input_type); + quant::ExpressedToQuantizedConverter::forInputType(input_type); quant::UniformQuantizedType quantizedEleType = quant::fakeQuantAttrsToType( builder.getUnknownLoc(), storage_type_width, min, max, narrow_range, diff --git a/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir b/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir index 5cbcb1e1cb8..a71e5cfae24 100644 --- a/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir +++ b/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir @@ -13,6 +13,11 @@ func @extractSimpleOphint() { return } +// CHECK: func @d4b1eb00b81211e99426dc4a3e957995(tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> +// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation"} + +// ----- + // CHECK-LABEL: extractPackedInputOphint func @extractPackedInputOphint() { // CHECK: %[[PACK:[0-9]*]] = "tfl.pack"(%0, %1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<2x1x16x1xf32> @@ -30,6 +35,11 @@ func @extractPackedInputOphint() { return } +// CHECK: func @47393154b9af11e99426dc4a3e957995(tensor<2x1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_stack"} + +// ----- + // CHECK-LABEL: extractFirstInputOphint func @extractFirstInputOphint() { // CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @b703f0f4b9ec11e99426dc4a3e957995(%0) : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> @@ -46,6 +56,11 @@ func @extractFirstInputOphint() { return } +// CHECK: func @b703f0f4b9ec11e99426dc4a3e957995(tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_first"} + +// ----- + // CHECK-LABEL: extractLastInputOphint func @extractLastInputOphint() { // CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @e31fcf90b9ed11e99426dc4a3e957995(%1) : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> @@ -62,6 +77,11 @@ func @extractLastInputOphint() { return } +// CHECK: func @e31fcf90b9ed11e99426dc4a3e957995(tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_last"} + +// ----- + // CHECK-LABEL: extractPackOneInputOphint func @extractPackOneInputOphint() { // CHECK: %[[RESHAPE:[0-9]*]] = "tfl.reshape"(%0) : (tensor<1x16x1xf32>) -> tensor<1x1x16x1xf32> @@ -75,13 +95,16 @@ func @extractPackOneInputOphint() { return } +// CHECK: func @33fab028b9ef11e99426dc4a3e957995(tensor<1x1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: 
attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_pack_input_one"} + +// ----- + // CHECK-LABEL: extractStackInputOutputOphint func @extractStackInputOutputOphint() { // CHECK: %[[PACK:[0-9]*]] = "tfl.pack"(%0, %1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> tensor<2x1x16x1xf32> // CHECK: %[[OP_HINT_CALL:[0-9]*]] = call @b92ed354b9f011e99426dc4a3e957995(%[[PACK]]) : (tensor<2x1x16x1xf32>) -> tensor<2x1x16x1xf32> // CHECK: %[[UNPACK:[0-9]*]]:2 = "tfl.unpack"(%[[OP_HINT_CALL]]) {axis = 0 : i32, num = 2 : i32} : (tensor<2x1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -// CHECK: %[[OUTPUT1:[0-9]*]] = "tf.Identity"(%[[UNPACK]]#0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: %[[OUTPUT2:[0-9]*]] = "tf.Identity"(%[[UNPACK]]#1) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_sort_index = 1 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-1-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_aggregate = "stack", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_stack_input_output", _tflite_function_sort_index = 0 : i64, _tflite_function_uuid = "b92ed354b9f011e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_stack_input_output-b92ed354b9f011e99426dc4a3e957995-0-0-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> @@ -98,11 +121,14 @@ func @extractStackInputOutputOphint() { return } +// CHECK: func @b92ed354b9f011e99426dc4a3e957995(tensor<2x1x16x1xf32>) -> tensor<2x1x16x1xf32> +// CHECK: attributes {_tflite_function_input_index = [0 : i32], _tflite_function_name = "cool_activation_stack_input_output"} + +// ----- + // CHECK-LABEL: extractMultipleInputsOutputsOphint func @extractMultipleInputsOutputsOphint() { -// CHECK: %[[OP_HINT_CALL:[0-9]*]]:2 = call @a6ca45beb9f411e99426dc4a3e957995(%0, %1) : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -// CHECK: %[[OUTPUT1:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]#0) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: %[[OUTPUT2:[0-9]*]] = "tf.Identity"(%[[OP_HINT_CALL]]#1) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_output_index = 1 : i64, _tflite_function_uuid = 
"a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-1-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> +// CHECK: %[[MULTI_INPUT_CALL:[0-9]*]]:2 = call @a6ca45beb9f411e99426dc4a3e957995(%0, %1) : (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x1xf32> %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "cool_activation_multiple_input_output", _tflite_function_uuid = "a6ca45beb9f411e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "InputHint-cool_activation_multiple_input_output-a6ca45beb9f411e99426dc4a3e957995-0-None-None"} : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32> @@ -119,21 +145,33 @@ func @extractMultipleInputsOutputsOphint() { return } -// CHECK: func @d4b1eb00b81211e99426dc4a3e957995(tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> -// CHECK: attributes {_tflite_function_name = "cool_activation"} -// CHECK: func @47393154b9af11e99426dc4a3e957995(tensor<2x1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: attributes {_tflite_function_name = "cool_activation_stack"} -// CHECK: func @b703f0f4b9ec11e99426dc4a3e957995(tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: attributes {_tflite_function_name = "cool_activation_first"} -// CHECK: func @e31fcf90b9ed11e99426dc4a3e957995(tensor<1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: attributes {_tflite_function_name = "cool_activation_last"} -// CHECK: func @33fab028b9ef11e99426dc4a3e957995(tensor<1x1x16x1xf32>) -> tensor<1x16x1xf32> -// CHECK: attributes {_tflite_function_name = "cool_activation_pack_input_one"} -// CHECK: func @b92ed354b9f011e99426dc4a3e957995(tensor<2x1x16x1xf32>) -> tensor<2x1x16x1xf32> -// CHECK: attributes {_tflite_function_name = "cool_activation_stack_input_output"} // CHECK: func @a6ca45beb9f411e99426dc4a3e957995(tensor<1x16x1xf32>, tensor<1x16x1xf32>) -> (tensor<1x16x1xf32>, tensor<1x16x1xf32>) -// CHECK: attributes {_tflite_function_name = "cool_activation_multiple_input_output"} +// CHECK: attributes {_tflite_function_input_index = [0 : i32, 1 : i32], _tflite_function_name = "cool_activation_multiple_input_output"} +// ----- + +// CHECK-LABEL: inputsAfterOutputs +func @inputsAfterOutputs() { +// CHECK: %[[PLACE_HOLDER:[0-9]*]] = "tf.Placeholder"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 2 } dim { size: 2 }"} : () -> tensor<2x2xf32> +// CHECK: %[[INPUT_PROCESS:[0-9]*]] = "tf.Sigmoid"(%[[PLACE_HOLDER]]) {T = "tfdtype$DT_FLOAT", device = "", name = "Sigmoid"} : (tensor<2x2xf32>) -> tensor<2x2xf32> +// CHECK: %[[OP_HINT_CALL:[0-9]*]]:2 = call @d6266124d2dd11e9b52cdc4a3e957995(%0, %1, %[[INPUT_PROCESS]]) : (tensor<2x2xf32>, tensor, tensor<2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>) + + %0 = "tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Const", value = dense<0.000000e+00> : tensor} : () -> tensor + %1 = "tf.Identity"(%0) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 1 : i64, _tflite_function_name = "CustomOp", _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "InputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-1-None-None"} : (tensor) -> tensor + %2 = "tf.Placeholder"() {device = "", dtype = 
"tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 2 } dim { size: 2 }"} : () -> tensor<2x2xf32> + %3 = "tf.Identity"(%2) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 0 : i64, _tflite_function_name = "CustomOp", _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "InputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-0-None-None"} : (tensor<2x2xf32>) -> tensor<2x2xf32> + %4 = "tf.Add"(%3, %1) {T = "tfdtype$DT_FLOAT", device = "", name = "Add"} : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %5 = "tf.Identity"(%4) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "CustomOp", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "OutputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-0-None-None"} : (tensor<2x2xf32>) -> tensor<2x2xf32> + %6 = "tf.Placeholder"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Placeholder_1", shape = "tfshape$dim { size: 2 } dim { size: 2 }"} : () -> tensor<2x2xf32> + %7 = "tf.Sigmoid"(%6) {T = "tfdtype$DT_FLOAT", device = "", name = "Sigmoid"} : (tensor<2x2xf32>) -> tensor<2x2xf32> + %8 = "tf.Identity"(%7) {T = "tfdtype$DT_FLOAT", _tflite_function_input_index = 2 : i64, _tflite_function_name = "CustomOp", _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "InputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-2-None-None"} : (tensor<2x2xf32>) -> tensor<2x2xf32> + %9 = "tf.Add"(%5, %8) {T = "tfdtype$DT_FLOAT", device = "", name = "Add_1"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + %10 = "tf.Identity"(%9) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "CustomOp", _tflite_function_output_index = 1 : i64, _tflite_function_uuid = "d6266124d2dd11e9b52cdc4a3e957995", _tflite_ophint_level = 1 : i64, device = "", name = "OutputHint-CustomOp-d6266124d2dd11e9b52cdc4a3e957995-1-None-None"} : (tensor<2x2xf32>) -> tensor<2x2xf32> + return +} + +// CHECK: func @d6266124d2dd11e9b52cdc4a3e957995(tensor<2x2xf32>, tensor, tensor<2x2xf32>) -> (tensor<2x2xf32>, tensor<2x2xf32>) +// CHECK: attributes {_tflite_function_input_index = [0 : i32, 1 : i32, 2 : i32], _tflite_function_name = "CustomOp"} // ----- diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 5d265305796..45853817aec 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -50,7 +50,7 @@ func @biasAddInt(%arg0: tensor<1x10x10x32xi32>, %arg1: tensor<32xi32>) -> tensor func @squeezeAndReshape(%arg0: tensor<1x1x10xf32>, %arg1: tensor) -> i32 { %0 = "tf.Squeeze"(%arg0) {squeeze_dims = [0]} : (tensor<1x1x10xf32>) -> tensor<1x10xf32> %1 = "tf.Squeeze"(%arg1) : (tensor) -> tensor<*xf32> - %2 = constant dense<[2, 5]> : tensor<2xi32> + %2 = "tf.Const"() { value = dense<[2, 5]> : tensor<2xi32> } : () -> tensor<2xi32> %3 = "tf.Reshape" (%0, %2) : (tensor<1x10xf32>, tensor<2xi32>) -> tensor<2x5xf32> %4 = "some_op"(%1, %3) : (tensor<*xf32>, tensor<2x5xf32>) -> i32 return %4 : i32 @@ -119,8 +119,8 @@ func @fakeQuantArgsTrue(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> { } func @fakeQuantVarsFalse(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> { - %arg1 = constant dense<-0.1> : tensor - %arg2 = constant dense<0.2> : tensor + %arg1 = "tf.Const"() { value = dense<-0.1> : tensor } : () -> tensor + %arg2 = "tf.Const"() { 
value = dense<0.2> : tensor } : () -> tensor %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {num_bits = 3, narrow_range = false} : (tensor<8x8x8x8xf32>, tensor, tensor) -> tensor<8x8x8x8xf32> return %0 : tensor<8x8x8x8xf32> @@ -153,6 +153,14 @@ func @placeholder(%arg0: tensor) -> tensor { // CHECK: %0 = "tfl.pseudo_input"(%arg0) : (tensor) -> tensor } +func @placeholder_int(%arg0: tensor) -> tensor { + %0 = "tf.Placeholder.input"(%arg0) {name = "Input"} : (tensor) -> tensor + return %0: tensor + +// CHECK-LABEL: @placeholder_int +// CHECK-NEXT: "tfl.pseudo_input"(%arg0) : (tensor) -> tensor +} + func @placeholder_min(%arg0: tensor) -> tensor { %0 = "tf.Placeholder.input"(%arg0) {name = "Input", min = -0.1 : f32} : (tensor) -> tensor return %0: tensor @@ -409,7 +417,7 @@ func @gatherNdHigherRankIndices(%arg0 : tensor<4x3x2xf32>, %arg1 : tensor<2x2xi3 } func @gatherV2VectorIndices(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x5xi32>) -> tensor<1x3x5x20xf32> { - %0 = constant dense<[1]> : tensor<1xi32> + %0 = "tf.Const"() { value = dense<[1]> : tensor<1xi32> } : () -> tensor<1xi32> %1 = "tf.GatherV2"(%arg0, %arg1, %0) : (tensor<1x2x20xf32>, tensor<3x5xi32>, tensor<1xi32>) -> tensor<1x3x5x20xf32> return %1 : tensor<1x3x5x20xf32> @@ -418,7 +426,7 @@ func @gatherV2VectorIndices(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x5xi32>) } func @gatherV2VectorIndicesNegAxis(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x5xi32>) -> tensor<1x2x3x5xf32> { - %0 = constant dense<[-1]> : tensor<1xi32> + %0 = "tf.Const"() { value = dense<[-1]> : tensor<1xi32> } : () -> tensor<1xi32> %1 = "tf.GatherV2"(%arg0, %arg1, %0) : (tensor<1x2x20xf32>, tensor<3x5xi32>, tensor<1xi32>) -> tensor<1x2x3x5xf32> return %1 : tensor<1x2x3x5xf32> @@ -427,7 +435,7 @@ func @gatherV2VectorIndicesNegAxis(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x } func @gatherV2NonZeroBatchDims(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x5xi32>) -> tensor<1x2x3x5xf32> { - %0 = constant dense<[1]> : tensor<1xi32> + %0 = "tf.Const"() { value = dense<[1]> : tensor<1xi32> } : () -> tensor<1xi32> %1 = "tf.GatherV2"(%arg0, %arg1, %0) {batch_dims = 1 : i64} : (tensor<1x2x20xf32>, tensor<3x5xi32>, tensor<1xi32>) -> tensor<1x2x3x5xf32> return %1 : tensor<1x2x3x5xf32> @@ -509,6 +517,15 @@ func @select(%arg0: tensor<8xi1>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>) -> // CHECK: return %0 : tensor<8xf32> } +func @select_multidim(%arg0: tensor<8xi1>, %arg1: tensor<8x3xf32>, %arg2: tensor<8x3xf32>) -> tensor<8x3xf32> { + %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<8xi1>, tensor<8x3xf32>, tensor<8x3xf32>) -> tensor<8x3xf32> + return %0: tensor<8x3xf32> + +// CHECK-LABEL: select_multidim +// CHECK: %0 = "tfl.select"(%arg0, %arg1, %arg2) +// CHECK: return %0 : tensor<8x3xf32> +} + func @select_v2(%arg0: tensor<8xi1>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>) -> tensor<8xf32> { %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<8xi1>, tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32> return %0: tensor<8xf32> @@ -518,6 +535,15 @@ func @select_v2(%arg0: tensor<8xi1>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>) // CHECK: return %0 : tensor<8xf32> } +func @select_v2_multidim(%arg0: tensor<8xi1>, %arg1: tensor<8x3xf32>, %arg2: tensor<8x3xf32>) -> tensor<8x3xf32> { + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<8xi1>, tensor<8x3xf32>, tensor<8x3xf32>) -> tensor<8x3xf32> + return %0: tensor<8x3xf32> + +// CHECK-LABEL: select_v2_multidim +// CHECK: %0 = "tfl.select"(%arg0, %arg1, %arg2) +// CHECK: return %0 : tensor<8x3xf32> +} + func @sin(%arg0: tensor) -> 
tensor { %0 = "tf.Sin"(%arg0) : (tensor) -> tensor return %0 : tensor @@ -536,7 +562,7 @@ func @topk(%arg0: tensor<8xf32>, %arg1: tensor) -> (tensor, tensor) -> (tensor<2xf32>, tensor<2xi32>) { - %0 = constant dense<2> : tensor + %0 = "tf.Const"() { value = dense<2> : tensor } : () -> tensor %1:2 = "tf.TopKV2"(%arg0, %0) : (tensor<8xf32>, tensor) -> (tensor<2xf32>, tensor<2xi32>) return %1#0, %1#1: tensor<2xf32>, tensor<2xi32> @@ -546,7 +572,7 @@ func @topk_2(%arg0: tensor<8xf32>) -> (tensor<2xf32>, tensor<2xi32>) { } func @topk_3(%arg0: tensor) -> (tensor, tensor) { - %0 = constant dense<2> : tensor + %0 = "tf.Const"() { value = dense<2> : tensor } : () -> tensor %1:2 = "tf.TopKV2"(%arg0, %0) : (tensor, tensor) -> (tensor, tensor) return %1#0, %1#1: tensor, tensor @@ -556,7 +582,7 @@ func @topk_3(%arg0: tensor) -> (tensor, tensor) { } func @topk_4(%arg0: tensor<1x2x3x4xf32>) -> (tensor<1x2x3x2xf32>, tensor<1x2x3x2xi32>) { - %0 = constant dense<2> : tensor + %0 = "tf.Const"() { value = dense<2> : tensor } : () -> tensor %1:2 = "tf.TopKV2"(%arg0, %0) : (tensor<1x2x3x4xf32>, tensor) -> (tensor<1x2x3x2xf32>, tensor<1x2x3x2xi32>) return %1#0, %1#1: tensor<1x2x3x2xf32>, tensor<1x2x3x2xi32> @@ -566,7 +592,7 @@ func @topk_4(%arg0: tensor<1x2x3x4xf32>) -> (tensor<1x2x3x2xf32>, tensor<1x2x3x2 } func @topk_5(%arg0: tensor<*xf32>) -> (tensor<*xf32>, tensor<*xi32>) { - %0 = constant dense<2> : tensor + %0 = "tf.Const"() { value = dense<2> : tensor } : () -> tensor %1:2 = "tf.TopKV2"(%arg0, %0) : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor<*xi32>) return %1#0, %1#1: tensor<*xf32>, tensor<*xi32> @@ -671,7 +697,7 @@ func @pow(%arg0: tensor<2x1x3xf32>, %arg1: tensor<2x1x1xf32>) -> tensor<2x1x3xf3 func @tile(tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x6xf32> { ^bb0(%arg0: tensor<2x3xf32>, %arg1: tensor<2xi32>): - %cst = constant dense<[1, 2]> : tensor<2xi32> + %cst = "tf.Const"() { value = dense<[1, 2]> : tensor<2xi32> } : () -> tensor<2xi32> %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x6xf32> return %0 : tensor<2x6xf32> @@ -682,7 +708,7 @@ func @tile(tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x6xf32> { func @padv2(tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor { ^bb0(%arg0: tensor<2x1x3xf32>, %arg1: tensor<3x2xi32>): - %cst = constant dense<2.0> : tensor + %cst = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor %0 = "tf.PadV2"(%arg0, %arg1, %cst) : (tensor<2x1x3xf32>, tensor<3x2xi32>, tensor) -> tensor return %0#0 : tensor @@ -858,8 +884,8 @@ func @matmul_transposed(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> t } func @concat2Tensors(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { - %0 = constant dense<[1]> : tensor<1xi32> - %1 = "tf.Concat"(%0, %arg0, %arg1) {N = 2 : i64} : (tensor<1xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + %0 = "tf.Const"() { value = dense<1> : tensor } : () -> tensor + %1 = "tf.Concat"(%0, %arg0, %arg1) {N = 2 : i64} : (tensor, tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> return %1 : tensor<2x2xi32> // CHECK-LABEL: concat2Tensors @@ -867,8 +893,8 @@ func @concat2Tensors(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi } func @concat3Tensors(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2x3xi32> { - %0 = constant dense<[-1]> : tensor<1xi32> - %1 = "tf.Concat"(%0, %arg0, %arg1, %arg2) {N = 3 : i64} : (tensor<1xi32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x3xi32> + %0 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor + %1 = 
"tf.Concat"(%0, %arg0, %arg1, %arg2) {N = 3 : i64} : (tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x3xi32> return %1 : tensor<2x3xi32> // CHECK-LABEL: concat3Tensors @@ -876,8 +902,8 @@ func @concat3Tensors(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>, %arg2: tensor<2 } func @concatv2With3Tensors(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2x3xi32> { - %0 = constant dense<[-1]> : tensor<1xi32> - %1 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %0) {N = 3 : i64} : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<1xi32>) -> tensor<2x3xi32> + %0 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor + %1 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %0) {N = 3 : i64} : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) -> tensor<2x3xi32> return %1 : tensor<2x3xi32> // CHECK-LABEL: concatv2With3Tensors @@ -1093,3 +1119,35 @@ func @depth_to_space(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> { // CHECK: %[[ARG:.*]]: tensor<1x1x1x4xf32> // CHECK: "tfl.depth_to_space"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> } + +func @non_max_suppression_v4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor<2xi32> { + %0:2 = "tf.NonMaxSuppressionV4"(%arg0, %arg1, %arg2, %arg3, %arg4) {pad_to_max_output_size = true}: (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) + return %0#0 : tensor<2xi32> + + // CHECK-LABEL: non_max_suppression_v4 + // CHECK: %0:2 = "tfl.non_max_suppression_v4"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) +} + +func @non_max_suppression_v4_no_pad(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor<2xi32> { + %0:2 = "tf.NonMaxSuppressionV4"(%arg0, %arg1, %arg2, %arg3, %arg4) {pad_to_max_output_size = false}: (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) + return %0#0 : tensor<2xi32> + + // CHECK-LABEL: non_max_suppression_v4_no_pad + // CHECK: %0:2 = "tfl.non_max_suppression_v4"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) +} + +func @non_max_suppression_v5(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor) -> tensor<2xi32> { + %0:3 = "tf.NonMaxSuppressionV5"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {pad_to_max_output_size = true}: (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor, tensor) -> (tensor<2xi32>, tensor<2xf32>, tensor) + return %0#0 : tensor<2xi32> + + // CHECK-LABEL: non_max_suppression_v5 + // CHECK: %0:3 = "tfl.non_max_suppression_v5"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) : (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor, tensor) -> (tensor<2xi32>, tensor<2xf32>, tensor) +} + +func @non_max_suppression_v5_no_pad(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor) -> tensor<2xi32> { + %0:3 = "tf.NonMaxSuppressionV5"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {pad_to_max_output_size = false}: (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor, tensor) -> (tensor<2xi32>, tensor<2xf32>, tensor) + return %0#0 : tensor<2xi32> + + // CHECK-LABEL: non_max_suppression_v5_no_pad + // CHECK: %0:3 = "tfl.non_max_suppression_v5"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) : (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor, tensor) -> (tensor<2xi32>, 
tensor<2xf32>, tensor) +} diff --git a/tensorflow/compiler/mlir/lite/tests/load-quantization-recipe.mlir b/tensorflow/compiler/mlir/lite/tests/load-quantization-recipe.mlir new file mode 100644 index 00000000000..5c53d5e05e7 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/load-quantization-recipe.mlir @@ -0,0 +1,107 @@ +// RUN: tf-opt -tfl-load-recipe %s | FileCheck %s --dump-input-on-failure + +// CHECK-LABEL: testLstm +func @testLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { + %0 = "tfl.lstm"(%arg0, // input + %arg1, %arg2, %arg3, %arg4, // weights + %arg5, %arg6, %arg7, %arg8, // recurrent weights + %arg9, %arg10, %arg11, // cell weights + %arg12, %arg13, %arg14, %arg15, // bias + %arg16, %arg17, // projection weight and bias + %arg18, %arg19, // stateful + %arg20, %arg21, %arg22, %arg23 // layer norm coefficients + ) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + +// CHECK-NEXT: "tfl.lstm" +// CHECK-NEXT: %[[cst:.*]] = constant unit + +// input gate +// CHECK-NEXT: %[[in1:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[in2:.*]] = "tfl.fully_connected"(%arg18, %arg5, %[[cst]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[in3:.*]] = "tfl.mul"(%arg19, %arg9) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[in4:.*]] = "tfl.add_n"(%[[in1]], %[[in2]], %[[in3]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[in5:.*]] = "tfl.l2_normalization"(%[[in4]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[in6:.*]] = tfl.add %[[in4]], %[[in5]] +// CHECK-SAME: tensor> +// CHECK-NEXT: %[[in7:.*]] = "tfl.fully_connected"(%[[in6]], %arg20, %arg12) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[in8:.*]] = "tfl.logistic"(%[[in7]]) +// CHECK-SAME: -> tensor> + +// forget gate +// CHECK-NEXT: %[[fo1:.*]] = "tfl.fully_connected"(%arg0, %arg2, %[[cst]]) +// CHECK-SAME: tensor> +// CHECK-NEXT: %[[fo2:.*]] = "tfl.fully_connected"(%arg18, %arg6, %[[cst]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[fo3:.*]] = "tfl.mul"(%arg19, %arg10) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[fo4:.*]] = "tfl.add_n"(%[[fo1]], %[[fo2]], %[[fo3]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[fo5:.*]] = "tfl.l2_normalization"(%[[fo4]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[fo6:.*]] = tfl.add %[[fo4]], %[[fo5]] +// CHECK-SAME: tensor> +// CHECK-NEXT: %[[fo7:.*]] = "tfl.fully_connected"(%[[fo6]], %arg21, %arg13) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[fo8:.*]] = "tfl.logistic"(%[[fo7]]) +// CHECK-SAME: -> tensor> + +// cell gate +// CHECK-NEXT: %[[ce1:.*]] = "tfl.fully_connected"(%arg0, %arg3, %[[cst]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ce2:.*]] = "tfl.fully_connected"(%arg18, %arg7, %[[cst]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ce3:.*]] = "tfl.add_n"(%[[ce1]], %[[ce2]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ce4:.*]] = "tfl.l2_normalization"(%[[ce3]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ce5:.*]] = tfl.add %[[ce3]], %[[ce4]] +// CHECK-SAME: tensor> +// CHECK-NEXT: 
%[[ce6:.*]] = "tfl.fully_connected"(%[[ce5]], %arg22, %arg14) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ce7:.*]] = "tfl.tanh"(%[[ce6]]) +// CHECK-SAME: -> tensor> + +// CHECK-NEXT: %[[ac1:.*]] = "tfl.mul"(%[[fo8]], %arg19) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ac2:.*]] = tfl.mul %[[in8]], %[[ce7]] +// CHECK-SAME: tensor> +// CHECK-NEXT: %[[ac3:.*]] = tfl.add %[[ac1]], %[[ac2]] +// CHECK-SAME: tensor> + +// output gate +// CHECK-NEXT: %[[ou1:.*]] = "tfl.fully_connected"(%arg0, %arg4, %[[cst]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ou2:.*]] = "tfl.fully_connected"(%arg18, %arg8, %[[cst]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ou3:.*]] = "tfl.mul"(%[[ac3]], %arg11) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ou4:.*]] = "tfl.add_n"(%[[ou1]], %[[ou2]], %[[ou3]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ou5:.*]] = "tfl.l2_normalization"(%[[ou4]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ou6:.*]] = tfl.add %[[ou4]], %[[ou5]] +// CHECK-SAME: tensor> +// CHECK-NEXT: %[[ou7:.*]] = "tfl.fully_connected"(%[[ou6]], %arg23, %arg15) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ou8:.*]] = "tfl.logistic"(%[[ou7]]) +// CHECK-SAME: -> tensor> + +// output activation +// CHECK-NEXT: %[[ac4:.*]] = "tfl.tanh"(%[[ac3]]) +// CHECK-SAME: -> tensor> +// CHECK-NEXT: %[[ac5:.*]] = tfl.mul %[[ac4]], %[[ou8]] +// CHECK-SAME: tensor> +// CHECK-NEXT: %[[ac6:.*]] = "tfl.fully_connected"(%[[ac5]], %arg16, %arg17) +// CHECK-SAME: (tensor>, tensor, tensor) -> tensor> +// CHECK-NEXT: %[[ac7:.*]] = "tf_quant.pseudo_return"(%[[ac6]]) : (tensor>) -> tensor> +// CHECK-NEXT: }) +// CHECK-NEXT: return + + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir index 817ced79ced..287958e905c 100644 --- a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir @@ -143,6 +143,19 @@ func @tensorlistPushBack(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: t // CHECK: return [[RESULT]] : tensor } +func @tensorlistLength(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>) -> (tensor) { + %0 = "tf.TensorListFromTensor"(%arg0, %arg1) : (tensor<3x10xf32>, tensor<1xi32>) -> tensor>> + %1 = "tf.TensorListLength"(%0) : (tensor>>) -> tensor + return %1: tensor + +// CHECK-LABEL: tensorlistLength +// CHECK-SAME: ([[INPUT:%.*]]: tensor<3x10xf32>, [[ELEM_SHAPE:%.*]]: tensor<1xi32>) +// CHECK-DAG: [[SHAPE:%.*]] = "tf.Shape"([[INPUT]]) {{.*}} -> tensor<2xi32> +// CHECK-DAG: [[ZERO:%cst.*]] = constant dense<0> : tensor +// CHECK: [[RESULT:%.*]] = "tf.Gather"([[SHAPE]], [[ZERO]]) {validate_indices = true} : (tensor<2xi32>, tensor) -> tensor +// CHECK: return [[RESULT]] : tensor +} + func @tensorlistWhileLoop(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { %cst = constant dense<3> : tensor<1xi32> %cst_0 = constant dense<0> : tensor diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir index ddb122f6e37..23976dbb476 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir @@ -278,6 +278,6 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t %21 = "tfl.pseudo_input" (%arg21) : (tensor<4 x f32>) -> tensor<4 x f32> %22 = "tfl.pseudo_input" (%arg22) : (tensor<4 x f32>) -> tensor<4 x f32> %23 = "tfl.pseudo_input" (%arg23) : (tensor<4 
x f32>) -> tensor<4 x f32> - %24 = "tfl.lstm"(%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %24 = "tfl.lstm"(%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %24 : tensor<4xf32> } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir new file mode 100644 index 00000000000..e89c2715c50 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir @@ -0,0 +1,31 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s + +module attributes { + tfl.metadata = {key1 = "value1", key2 = "value2"} +} { + func @main(tensor<3x2xi32>) -> tensor<3x2xi32> + attributes {tf.entry_function = {inputs = "input", outputs = "SameNameAsOutput"}} { + ^bb0(%arg0: tensor<3x2xi32>): + %0 = "tfl.pseudo_input" (%arg0) : (tensor<3x2xi32>) -> tensor<3x2xi32> loc("Input") + %1 = "tfl.pseudo_const" () {value = dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32> + %2 = "tfl.sub" (%0, %1) {fused_activation_function = "NONE"} : (tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> + return %2 : tensor<3x2xi32> + } +} + +// CHECK: buffers: [ { +// CHECK: }, { +// CHECK: }, { +// CHECK: }, { +// CHECK: }, { +// CHECK-NEXT: data: [ 118, 97, 108, 117, 101, 49 ] +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 118, 97, 108, 117, 101, 50 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "key1", +// CHECK-NEXT: buffer: 4 +// CHECK-NEXT: }, { +// CHECK-NEXT: name: "key2", +// CHECK-NEXT: buffer: 5 +// CHECK-NEXT: } ] diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/optional.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/optional.mlir index 97129df86a2..d62d0ac2c31 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/optional.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/optional.mlir @@ -1,5 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - -// | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s func @main(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { %cst = constant unit @@ -9,7 +8,7 @@ func @main(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf return %2 : tensor<40x40xf32> } -// CHECK-NEXT: operators: [ { +// CHECK: operators: [ { // CHECK-NEXT: inputs: 
[ 0, 1, -1 ], // CHECK-NEXT: outputs: [ 2, 3 ], // CHECK-NEXT: builtin_options_type: FullyConnectedOptions, diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index a0d78c25297..3a051678664 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -103,7 +103,7 @@ func @testAddN(tensor, tensor, tensor) -> tensor, tensor, tensor) -> tensor { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor): - // expected-error @+1 {{'tfl.add_n' op operand #0 must be tensor of 32-bit float or 32-bit integer values}} + // expected-error @+1 {{'tfl.add_n' op operand #0 must be tensor of 32-bit float or 32-bit integer or QI16 type or QUI16 type values}} %0 = "tfl.add_n"(%arg0, %arg1, %arg2): (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -537,7 +537,7 @@ func @testLogistic(tensor<1x2x3x4x5xbf16>) -> tensor<1x2x3x4x5xbf16> { // test invalid Logistic input func @testLogisticWithWrongInputType(tensor) -> tensor { ^bb0(%arg0: tensor): - // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of floating-point or QI8 type or QUI8 type values}} + // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of floating-point or QI8 type or QUI8 type or QI16 type or QUI16 type values}} %0 = "tfl.logistic"(%arg0): (tensor) -> tensor return %0#0 : tensor } @@ -591,8 +591,9 @@ func @testUnidirectionalSequenceLstmWithInvalidNoneType(%arg0: tensor, // CHECK-LABEL: testLstm func @testLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) + // CHECK-NEXT: {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {fused_activation_function = 
"NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -600,8 +601,9 @@ func @testLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) + // CHECK-NEXT: {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -610,7 +612,7 @@ func @testLstmWithNoneTypeAndOverrideAttr(%arg0: tensor, %arg1: none, % // test invalid none type applied to a tensor type arg func @testLstmWithInvalidNoneType(%arg0: tensor, %arg1: tensor, %arg2: none, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // expected-error @+1 {{'tfl.lstm' op operand #2 must be tensor of 32-bit float or 8-bit integer values}} - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, 
%arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE"} : (tensor, tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {fused_activation_function = "NONE"} : (tensor, tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -619,7 +621,7 @@ func @testLstmWithInvalidNoneType(%arg0: tensor, %arg1: tensor // test violation of projection weight and projection bias pred op trait func @testLstmWithInvalidNoneType(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: none, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // expected-error @+1 {{'tfl.lstm' op failed to verify that either projection weight must be specified or both projection weight and projection bias must not be specified}} - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {fused_activation_function = "NONE"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -628,7 +630,7 @@ func @testLstmWithInvalidNoneType(%arg0: tensor, %arg1: tensor // test invalid kernel type func @testLstmWithInvalidKernelType(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // expected-error @+1 {{'tfl.lstm' op attribute 'kernel_type' failed to satisfy constraint: lstm kernel type enum case FULL}} - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "BASIC"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, 
tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "BASIC"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -652,6 +654,15 @@ func @testSelect(%cond : tensor, %arg0 : tensor, %arg1 : tensor, %arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = "tfl.select"(%cond, %arg0, %arg1): (tensor,tensor,tensor) -> tensor + return %0 : tensor +} + +// ----- + func @testSelectWithUnsupportedType(%cond : tensor, %arg0 : tensor, %arg1 : tensor) -> tensor { // expected-error @+1 {{op operand #0 must be tensor of 1-bit integer values}} %0 = "tfl.select"(%cond, %arg0, %arg1): (tensor,tensor,tensor) -> tensor @@ -660,6 +671,14 @@ func @testSelectWithUnsupportedType(%cond : tensor, %arg0 : tensor // ----- +func @testSelectWithUnsupportedShapes(%cond : tensor<2xi1>, %arg0 : tensor<3xi32>, %arg1 : tensor<3xi32>) -> tensor<3xi32> { + // expected-error @+1 {{failed to verify that Select operands meet shape criteria}} + %0 = "tfl.select"(%cond, %arg0, %arg1): (tensor<2xi1>,tensor<3xi32>,tensor<3xi32>) -> tensor<3xi32> + return %0 : tensor<3xi32> +} + +// ----- + func @testSelectWithUnsupportedType(%cond : tensor, %arg0 : tensor, %arg1 : tensor) -> tensor { // expected-error @+1 {{failed to verify that operands have same element type}} %0 = "tfl.select"(%cond, %arg0, %arg1): (tensor,tensor,tensor) -> tensor @@ -762,6 +781,21 @@ func @testPadWithInvalidPaddingsRank(tensor<2x1x3xf32>, tensor<1x3x2xi32>) -> te // ----- +// CHECK-LABEL: testPadQuantizedU8 +func @testPadQuantizedU8(%arg0: tensor<2x1x3x!quant.uniform>, %arg1: tensor<3x2xi32>) -> tensor> { + // CHECK: "tfl.pad"(%arg0, %arg1) + %0 = "tfl.pad"(%arg0, %arg1) : (tensor<2x1x3x!quant.uniform>, tensor<3x2xi32>) -> tensor> + return %0#0 : tensor> +} + +// CHECK-LABEL: testPadQuantizedI8 +func @testPadQuantizedI8(%arg0: tensor<2x1x3x!quant.uniform>, %arg1: tensor<3x2xi32>) -> tensor> { + // CHECK: "tfl.pad"(%arg0, %arg1) + %0 = "tfl.pad"(%arg0, %arg1) : (tensor<2x1x3x!quant.uniform>, tensor<3x2xi32>) -> tensor> + return %0#0 : tensor> +} +// ----- + // CHECK-LABEL: testPadV2 func @testPadV2(tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor { ^bb0(%arg0: tensor<2x1x3xf32>, %arg1: tensor<3x2xi32>): @@ -817,6 +851,20 @@ func @testPadV2WithInvalidConstantScalar(tensor<2x1x3xf32>, tensor<3x2xi32>) -> // ----- +func @packQuantizedU8(%arg0: tensor<2x!quant.uniform>, %arg1: tensor<2x!quant.uniform>) -> tensor<2x2x!quant.uniform> { + // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} + %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> +} + +func @packQuantizedI8(%arg0: tensor<2x!quant.uniform>, %arg1: tensor<2x!quant.uniform>) -> tensor<2x2x!quant.uniform> { + // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} + %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> +} + +// ----- + func @pack(%arg0: tensor<2xi32>, 
%arg1: tensor<2xi32>) -> tensor<2x2xi32> { // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> @@ -1101,6 +1149,63 @@ func @transpose_perm_not_i32(%arg0 : tensor<2x2xi32>, %arg1 : tensor<2xf32>) -> } +// ----- + +func @transpose_perm_size(%arg0 : tensor<2x2xi32>, %arg1 : tensor<3xi32>) -> tensor<2x2xi32> { + // expected-error @+1 {{perm tensor elements size is not equal to input tensor rank}} + %0 = "tfl.transpose"(%arg0, %arg1) : (tensor<2x2xi32>, tensor<3xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + + +// ----- + +func @transpose_unranked_shape(%arg0 : tensor<*xi32>) -> tensor<2x2xi32> { + %cst = constant dense<[1, 0]> : tensor<2xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<*xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + + +// ----- + +func @transpose_dynamic_shape(%arg0 : tensor<2x?xi32>) -> tensor { + %cst = constant dense<[1, 0]> : tensor<2xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<2x?xi32>, tensor<2xi32>) -> tensor + return %0 : tensor +} + + +// ----- + +func @transpose_perm_axis_invalid(%arg0 : tensor<2x2xi32>) -> tensor<2x2xi32> { + %cst = constant dense<[1, -1]> : tensor<2xi32> + // expected-error @+1 {{perm[1] must be in [0, rank)}} + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + + +// ----- + +func @transpose_perm_axis_duplicated(%arg0 : tensor<2x2xi32>) -> tensor<2x2xi32> { + %cst = constant dense<[1, 1]> : tensor<2xi32> + // expected-error @+1 {{perm[1] cannot have duplicated axis}} + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<2x2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + + +// ----- + +func @transpose_output_type_bad(%arg0 : tensor<3x4x5x6xi32>) -> tensor<3x4x5x6xi32> { + %cst = constant dense<[0, 3, 1, 2]> : tensor<4xi32> + // expected-error @+1 {{expect output type tensor<3x6x4x5xi32>, got tensor<3x4x5x6xi32>}} + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<3x4x5x6xi32>, tensor<4xi32>) -> tensor<3x4x5x6xi32> + return %0 : tensor<3x4x5x6xi32> +} + + // ----- func @transpose_element_type(%arg0 : tensor<2x2xf32>, %arg1 : tensor<2xi32>) -> tensor<2x2xi32> { @@ -1643,3 +1748,33 @@ func @testSplitVOpWithValidSizeSplitsNegative(%arg0 : tensor<16x4xf32>) -> (tens return %0, %1, %2, %3, %4 : tensor<7x4xf32>, tensor<3x4xf32>, tensor<6x4xf32>, tensor<16x0xf32>, tensor<16x4xf32> } + +// ----- + +func @testNonMaxSuppressionV4WithCorrectBoxShape(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> (tensor<2xi32>, tensor) { + %0, %1 = "tfl.non_max_suppression_v4"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) + return %0, %1 : tensor<2xi32>, tensor +} + +// ----- + +func @testNonMaxSuppressionV4WithWrongBoxShape(%arg0: tensor<3x2xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> (tensor<2xi32>, tensor) { + // expected-error @+1 {{'tfl.non_max_suppression_v4' op failed to verify that boxes should have dim[1] == 4}} + %0, %1 = "tfl.non_max_suppression_v4"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<3x2xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) + return %0, %1 : tensor<2xi32>, tensor +} + +// ----- + +func @testNonMaxSuppressionV5WithCorrectBoxShape(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, 
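
A quick sanity check (not part of the patch) of the transpose verifier rule exercised above, out_shape[i] = in_shape[perm[i]], which is why perm [0, 3, 1, 2] applied to tensor<3x4x5x6xi32> must yield tensor<3x6x4x5xi32>:

    # NumPy illustration of the expected-output-type rule in the transpose tests.
    import numpy as np

    x = np.zeros((3, 4, 5, 6), dtype=np.int32)
    perm = [0, 3, 1, 2]
    print(np.transpose(x, perm).shape)  # (3, 6, 4, 5), i.e. tensor<3x6x4x5xi32>
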
%arg3: tensor, %arg4: tensor, %arg5: tensor) -> (tensor<2xi32>, tensor<2xf32>, tensor) { + %0, %1, %2 = "tfl.non_max_suppression_v5"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) : (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor, tensor) -> (tensor<2xi32>, tensor<2xf32>, tensor) + return %0, %1, %2 : tensor<2xi32>, tensor<2xf32>, tensor +} + +// ----- + +func @testNonMaxSuppressionV5WithWrongBoxShape(%arg0: tensor<3x2xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor) -> (tensor<2xi32>, tensor<2xf32>, tensor) { + // expected-error @+1 {{'tfl.non_max_suppression_v5' op failed to verify that boxes should have dim[1] == 4}} + %0, %1, %2 = "tfl.non_max_suppression_v5"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) : (tensor<3x2xf32>, tensor<3xf32>, tensor, tensor, tensor, tensor) -> (tensor<2xi32>, tensor<2xf32>, tensor) + return %0, %1, %2 : tensor<2xi32>, tensor<2xf32>, tensor +} diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 138962d5fca..f1e556703e3 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -292,16 +292,3 @@ func @InvalidL2NormalizePattern(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> t // CHECK: %3 = "tfl.div"([[INPUT:%.*]], %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> // CHECK: return %3 } - -// CHECK-LABEL: @InvalidL2NormalizePatternMorethan1Dimension -// Input has higher rank, it should be limited to 1D only. -func @InvalidL2NormalizePatternMorethan1Dimension(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { - %cst = constant dense<[0]> : tensor<1xi32> - %0 = "tfl.square"(%arg0) : (tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tfl.sum"(%0, %cst) {keep_dims = false} : (tensor<2x2xf32>, tensor<1xi32>) -> tensor - %2 = "tfl.sqrt"(%1) : (tensor) -> tensor - %3 = "tfl.div"(%arg0, %2) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> - return %3: tensor<2x2xf32> - // CHECK: %3 = "tfl.div"([[INPUT:%.*]], %2) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> - // CHECK: return %3 -} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index 5fd57ab21b4..092cb1e52f9 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -373,21 +373,12 @@ func @QuantizeConstant() -> tensor<2x3xf32> { // CHECK: return %1 : tensor<2x3xf32> } -// CHECK-LABEL: NotQuantizeNonZeroSplat -func @NotQuantizeNonZeroSplat() -> tensor<2x3xf32> { - %cst = constant dense<2.0> : tensor<2x3xf32> - return %cst : tensor<2x3xf32> +// CHECK-LABEL: NotQuantizeNoneType +func @NotQuantizeNoneType() -> none { + %cst = constant unit + return %cst : none -// CHECK-NEXT: %[[cst:.*]] = constant dense<2.000000e+00> -// CHECK-NEXT: return %[[cst]] -} - -// CHECK-LABEL: NotQuantizeNonZeroScalar -func @NotQuantizeNonZeroScalar() -> tensor { - %cst = constant dense<2.0> : tensor - return %cst : tensor - -// CHECK-NEXT: %[[cst:.*]] = constant dense<2.000000e+00> +// CHECK-NEXT: %[[cst:.*]] = constant unit // CHECK-NEXT: return %[[cst]] } @@ -433,6 +424,32 @@ func @QuantizeSharedBiases( // CHECK: %[[cst_0:.*]] = constant dense<1.000000e+00> : tensor<32xf32> // CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) // CHECK: %[[dq_0:.*]] = "tfl.dequantize"(%[[q_0]]) -// CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, 
%{{.*}}, %[[dq]]) // CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq_0]]) +// CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq]]) +} + +// CHECK-LABEL: QuantizeSharedBiases2 +func @QuantizeSharedBiases2( + %arg0: tensor<32x!quant.uniform>, + %arg1: tensor<1x112x112x32x!quant.uniform>, + %arg2: tensor<32x3x3x3x!quant.uniform:f32, 2.0>>) -> (tensor<32x!quant.uniform>, tensor<1x56x56x32x!quant.uniform>) { + %cst = constant dense<1.0> : tensor<32xf32> + %1 = "tfl.dequantize"(%arg0) : (tensor<32x!quant.uniform>) -> tensor<32xf32> + %add = "tfl.add"(%1, %cst) {fused_activation_function = "NONE"} : (tensor<32xf32>, tensor<32xf32>) -> tensor<32xf32> + %3 = "tfl.quantize"(%add) {qtype = tensor<32xf32>} : (tensor<32xf32>) -> tensor<32x!quant.uniform> + + %5 = "tfl.dequantize"(%arg1) : (tensor<1x112x112x32x!quant.uniform>) -> tensor<1x112x112x32xf32> + %6 = "tfl.dequantize"(%arg2) : (tensor<32x3x3x3x!quant.uniform:f32, 2.0>>) -> tensor<32x3x3x3xf32> + %conv2 = "tfl.conv_2d"(%5, %6, %cst) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x112x112x32xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x56x56x32xf32> + %7 = "tfl.quantize"(%conv2) {qtype = tensor<1x56x56x32x!quant.uniform>} : (tensor<1x56x56x32xf32>) -> tensor<1x56x56x32x!quant.uniform> + return %3, %7 : tensor<32x!quant.uniform>, tensor<1x56x56x32x!quant.uniform> + +// CHECK: %[[cst:.*]] = constant dense<1.000000e+00> : tensor<32xf32> +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) +// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) +// CHECK: %[[cst_0:.*]] = constant dense<1.000000e+00> : tensor<32xf32> +// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<32x!quant.uniform:f32, 1.000000e+00:1>>} +// CHECK: %[[dq_0:.*]] = "tfl.dequantize"(%[[q_0]]) +// CHECK: %{{.*}} = tfl.add %{{.*}}, %[[dq_0]] +// CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq]]) } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 6a7d883ce50..8d6d7ab513e 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -63,7 +63,7 @@ func @fusedBatchNorm(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8 return %2, %2#1 : tensor<8x8x8x8xf32>, tensor<8xf32> // CHECK-LABEL: fusedBatchNorm -// CHECK: %[[CONSTANT:.*]] = "tf.Const"{{.*}} dense<1.000000e-03> +// CHECK: %[[CONSTANT:.*]] = constant dense<1.000000e-03> // variance + epsilon // CHECK: %[[ADD1:.*]] = "tf.Add"(%[[ARG4:.*]], %[[CONSTANT]]) // rsqrt(variance + epsilon) @@ -96,7 +96,7 @@ func @fusedBatchNormV3(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor return %2, %2#1 : tensor<8x8x8x8xf32>, tensor<8xf32> // CHECK-LABEL: fusedBatchNormV3 -// CHECK: %[[CONSTANT:.*]] = "tf.Const"{{.*}} dense<1.000000e-03> +// CHECK: %[[CONSTANT:.*]] = constant dense<1.000000e-03> // variance + epsilon // CHECK: %[[ADD1:.*]] = "tf.Add"(%[[ARG4:.*]], %[[CONSTANT]]) // rsqrt(variance + epsilon) @@ -155,7 +155,7 @@ func @fakeQuantFolded() -> (tensor<8xf32>) { %rst = "tf.FakeQuantWithMinMaxVars"(%in, %mini, %maxi) {num_bits = 3, narrow_range = false} : (tensor<8xf32>, tensor, tensor) -> tensor<8xf32> return %rst : tensor<8xf32> -// CHECK: %[[CONSTANT:.*]] = "tf.Const"{{.*}} dense<0.000000e+00> : tensor<8xf32> +// CHECK: %[[CONSTANT:.*]] = constant dense<0.000000e+00> : tensor<8xf32> // CHECK: %[[QUANTIZE:.*]] = 
"tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> @@ -262,7 +262,7 @@ func @fakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x30x30x16xf32>) return %rst : tensor<256x30x30x16xf32> // CHECK: %[[CONSTANT:.*]] = constant dense<0.000000e+00> : tensor<16xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"{{.*}} dense<0.000000e+00> : tensor<16x3x3x3xf32> +// CHECK: %[[CONSTANT0:.*]] = constant dense<0.000000e+00> : tensor<16x3x3x3xf32> // CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<16x3x3x3x!quant.uniform>} // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tfl.conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) @@ -282,7 +282,7 @@ func @fakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x30x30 return %rst : tensor<256x30x30x16xf32> // CHECK: %[[CONSTANT:.*]] = constant dense<0.000000e+00> : tensor<48xf32> -// CHECK: %[[CONSTANT0:.*]] = "tf.Const"{{.*}} dense<0.000000e+00> : tensor<1x3x3x48xf32> +// CHECK: %[[CONSTANT0:.*]] = constant dense<0.000000e+00> : tensor<1x3x3x48xf32> // CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<1x3x3x48x!quant.uniform>} // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) @@ -348,3 +348,11 @@ func @stop_gradient(%arg0: tensor<3xi32>) -> tensor<3xi32> { // CHECK-LABEL: stop_gradient // CHECK: return %arg0 : tensor<3xi32> } + +func @CheckNumerics(%arg0: tensor<3xf32>) -> tensor<3xf32> { + %0 = "tf.CheckNumerics"(%arg0) {message = ""}: (tensor<3xf32>) -> tensor<3xf32> + return %0 : tensor<3xf32> + // Should be converted to Identity and then from Identity to value + // CHECK-LABEL: CheckNumerics + // CHECK: return %arg0 : tensor<3xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/unroll-batch-matmul.mlir b/tensorflow/compiler/mlir/lite/tests/unroll-batch-matmul.mlir new file mode 100644 index 00000000000..09f1dfc9133 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/unroll-batch-matmul.mlir @@ -0,0 +1,223 @@ +// RUN: tf-opt -tfl-unroll-batch-matmul %s | FileCheck %s + +func @batchMatMulV2TwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6xf32>) -> tensor<2x3x4x6xf32> { + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) : (tensor<2x3x4x5xf32>, tensor<2x3x5x6xf32>) -> tensor<2x3x4x6xf32> + return %0 : tensor<2x3x4x6xf32> + + // CHECK-LABEL: batchMatMulV2TwoDim + // CHECK: %[[cst:.*]] = constant dense<[6, 4, 5]> : tensor<3xi64> + // CHECK: %[[cst_0:.*]] = constant dense<[1, 4, 5]> : tensor<3xi64> + // CHECK: %[[cst_1:.*]] = constant dense<[4, 5]> : tensor<2xi64> + // CHECK: %[[cst_2:.*]] = constant dense<[6, 5, 6]> : tensor<3xi64> + // CHECK: %[[cst_3:.*]] = constant dense<0> : tensor<3xi64> + // CHECK: %[[cst_4:.*]] = constant dense<[1, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_5:.*]] = constant dense<[2, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_6:.*]] = constant dense<[3, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_7:.*]] = constant dense<[4, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_8:.*]] = constant dense<[5, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_9:.*]] = constant dense<[1, 5, 6]> : tensor<3xi64> + // CHECK: %[[cst_10:.*]] = constant dense<[5, 6]> : tensor<2xi64> + // CHECK: %[[cst_11:.*]] = constant dense<[2, 3, 4, 6]> : tensor<4xi64> + + // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<2x3x4x5xf32>, 
tensor<3xi64>) -> tensor<6x4x5xf32> + // CHECK: %[[v1:.*]] = "tf.Slice"(%[[v0]], %[[cst_3]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v3:.*]] = "tf.Slice"(%[[v0]], %[[cst_4]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v4:.*]] = "tf.Reshape"(%[[v3]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v5:.*]] = "tf.Slice"(%[[v0]], %[[cst_5]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v6:.*]] = "tf.Reshape"(%[[v5]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v7:.*]] = "tf.Slice"(%[[v0]], %[[cst_6]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v8:.*]] = "tf.Reshape"(%[[v7]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v9:.*]] = "tf.Slice"(%[[v0]], %[[cst_7]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v10:.*]] = "tf.Reshape"(%[[v9]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v11:.*]] = "tf.Slice"(%[[v0]], %[[cst_8]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v12:.*]] = "tf.Reshape"(%[[v11]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + + // CHECK: %[[v13:.*]] = "tf.Reshape"(%arg1, %[[cst_2]]) : (tensor<2x3x5x6xf32>, tensor<3xi64>) -> tensor<6x5x6xf32> + // CHECK: %[[v14:.*]] = "tf.Slice"(%[[v13]], %[[cst_3]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v15:.*]] = "tf.Reshape"(%[[v14]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v16:.*]] = "tf.Slice"(%[[v13]], %[[cst_4]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v17:.*]] = "tf.Reshape"(%[[v16]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v18:.*]] = "tf.Slice"(%[[v13]], %[[cst_5]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v19:.*]] = "tf.Reshape"(%[[v18]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v20:.*]] = "tf.Slice"(%[[v13]], %[[cst_6]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v21:.*]] = "tf.Reshape"(%[[v20]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v22:.*]] = "tf.Slice"(%[[v13]], %[[cst_7]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v23:.*]] = "tf.Reshape"(%[[v22]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v24:.*]] = "tf.Slice"(%[[v13]], %[[cst_8]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v25:.*]] = "tf.Reshape"(%[[v24]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + + // CHECK: %[[v26:.*]] = "tf.MatMul"(%[[v2]], %[[v15]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v27:.*]] = "tf.MatMul"(%[[v4]], %[[v17]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, 
tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v28:.*]] = "tf.MatMul"(%[[v6]], %[[v19]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v29:.*]] = "tf.MatMul"(%[[v8]], %[[v21]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v30:.*]] = "tf.MatMul"(%[[v10]], %[[v23]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v31:.*]] = "tf.MatMul"(%[[v12]], %[[v25]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + + // CHECK: %[[v32:.*]] = "tf.Pack"(%[[v26]], %[[v27]], %[[v28]], %[[v29]], %[[v30]], %[[v31]]) {N = 6 : i64, axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[v33:.*]] = "tf.Reshape"(%[[v32]], %[[cst_11]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> + + // CHECK: return %[[v33]] : tensor<2x3x4x6xf32> +} + +func @batchMatMulV2FlatInput(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> tensor<3x4x6xf32> { + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> + return %0 : tensor<3x4x6xf32> + + // CHECK-LABEL: batchMatMulV2FlatInput + // CHECK: %[[cst:.*]] = constant dense<[3, 4, 5]> : tensor<3xi64> + // CHECK: %[[cst_0:.*]] = constant dense<[1, 4, 5]> : tensor<3xi64> + // CHECK: %[[cst_1:.*]] = constant dense<[4, 5]> : tensor<2xi64> + // CHECK: %[[cst_2:.*]] = constant dense<[3, 5, 6]> : tensor<3xi64> + // CHECK: %[[cst_3:.*]] = constant dense<0> : tensor<3xi64> + // CHECK: %[[cst_4:.*]] = constant dense<[1, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_5:.*]] = constant dense<[2, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_6:.*]] = constant dense<[1, 5, 6]> : tensor<3xi64> + // CHECK: %[[cst_7:.*]] = constant dense<[5, 6]> : tensor<2xi64> + // CHECK: %[[cst_8:.*]] = constant dense<[3, 4, 6]> : tensor<3xi64> + + // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<3x4x5xf32>, tensor<3xi64>) -> tensor<3x4x5xf32> + // CHECK: %[[v1:.*]] = "tf.Slice"(%[[v0]], %[[cst_3]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v3:.*]] = "tf.Slice"(%[[v0]], %[[cst_4]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v4:.*]] = "tf.Reshape"(%[[v3]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v5:.*]] = "tf.Slice"(%[[v0]], %[[cst_5]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v6:.*]] = "tf.Reshape"(%[[v5]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + + // CHECK: %[[v7:.*]] = "tf.Reshape"(%arg1, %[[cst_2]]) : (tensor<3x5x6xf32>, tensor<3xi64>) -> tensor<3x5x6xf32> + // CHECK: %[[v8:.*]] = "tf.Slice"(%[[v7]], %[[cst_3]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v9:.*]] = "tf.Reshape"(%[[v8]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v10:.*]] = "tf.Slice"(%[[v7]], %[[cst_4]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v11:.*]] = "tf.Reshape"(%[[v10]], %[[cst_7]]) : (tensor<1x5x6xf32>, 
tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v12:.*]] = "tf.Slice"(%[[v7]], %[[cst_5]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v13:.*]] = "tf.Reshape"(%[[v12]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + + // CHECK: %[[v14:.*]] = "tf.MatMul"(%[[v2]], %[[v9]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v15:.*]] = "tf.MatMul"(%[[v4]], %[[v11]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v16:.*]] = "tf.MatMul"(%[[v6]], %[[v13]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + + // CHECK: %[[v17:.*]] = "tf.Pack"(%[[v14]], %[[v15]], %[[v16]]) {N = 3 : i64, axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[v18:.*]] = "tf.Reshape"(%[[v17]], %[[cst_8]]) : (tensor<3x4x6xf32>, tensor<3xi64>) -> tensor<3x4x6xf32> + + // CHECK: return %[[v18]] : tensor<3x4x6xf32> +} + +func @batchMatMulV2Matrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) -> tensor<4x6xf32> { + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + return %0 : tensor<4x6xf32> + + // CHECK-LABEL: batchMatMulV2Matrix + // CHECK: %[[v0:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: return %[[v0]] : tensor<4x6xf32> +} + +func @batchMatMulTwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6xf32>) -> tensor<2x3x4x6xf32> { + %0 = "tf.BatchMatMul"(%arg0, %arg1) : (tensor<2x3x4x5xf32>, tensor<2x3x5x6xf32>) -> tensor<2x3x4x6xf32> + return %0 : tensor<2x3x4x6xf32> + + // CHECK-LABEL: batchMatMulTwoDim + // CHECK: %[[cst:.*]] = constant dense<[6, 4, 5]> : tensor<3xi64> + // CHECK: %[[cst_0:.*]] = constant dense<[1, 4, 5]> : tensor<3xi64> + // CHECK: %[[cst_1:.*]] = constant dense<[4, 5]> : tensor<2xi64> + // CHECK: %[[cst_2:.*]] = constant dense<[6, 5, 6]> : tensor<3xi64> + // CHECK: %[[cst_3:.*]] = constant dense<0> : tensor<3xi64> + // CHECK: %[[cst_4:.*]] = constant dense<[1, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_5:.*]] = constant dense<[2, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_6:.*]] = constant dense<[3, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_7:.*]] = constant dense<[4, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_8:.*]] = constant dense<[5, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_9:.*]] = constant dense<[1, 5, 6]> : tensor<3xi64> + // CHECK: %[[cst_10:.*]] = constant dense<[5, 6]> : tensor<2xi64> + // CHECK: %[[cst_11:.*]] = constant dense<[2, 3, 4, 6]> : tensor<4xi64> + + // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<2x3x4x5xf32>, tensor<3xi64>) -> tensor<6x4x5xf32> + // CHECK: %[[v1:.*]] = "tf.Slice"(%[[v0]], %[[cst_3]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v3:.*]] = "tf.Slice"(%[[v0]], %[[cst_4]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v4:.*]] = "tf.Reshape"(%[[v3]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v5:.*]] = "tf.Slice"(%[[v0]], %[[cst_5]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v6:.*]] = 
"tf.Reshape"(%[[v5]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v7:.*]] = "tf.Slice"(%[[v0]], %[[cst_6]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v8:.*]] = "tf.Reshape"(%[[v7]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v9:.*]] = "tf.Slice"(%[[v0]], %[[cst_7]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v10:.*]] = "tf.Reshape"(%[[v9]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v11:.*]] = "tf.Slice"(%[[v0]], %[[cst_8]], %[[cst_0]]) : (tensor<6x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v12:.*]] = "tf.Reshape"(%[[v11]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + + // CHECK: %[[v13:.*]] = "tf.Reshape"(%arg1, %[[cst_2]]) : (tensor<2x3x5x6xf32>, tensor<3xi64>) -> tensor<6x5x6xf32> + // CHECK: %[[v14:.*]] = "tf.Slice"(%[[v13]], %[[cst_3]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v15:.*]] = "tf.Reshape"(%[[v14]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v16:.*]] = "tf.Slice"(%[[v13]], %[[cst_4]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v17:.*]] = "tf.Reshape"(%[[v16]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v18:.*]] = "tf.Slice"(%[[v13]], %[[cst_5]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v19:.*]] = "tf.Reshape"(%[[v18]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v20:.*]] = "tf.Slice"(%[[v13]], %[[cst_6]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v21:.*]] = "tf.Reshape"(%[[v20]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v22:.*]] = "tf.Slice"(%[[v13]], %[[cst_7]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v23:.*]] = "tf.Reshape"(%[[v22]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v24:.*]] = "tf.Slice"(%[[v13]], %[[cst_8]], %[[cst_9]]) : (tensor<6x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v25:.*]] = "tf.Reshape"(%[[v24]], %[[cst_10]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + + // CHECK: %[[v26:.*]] = "tf.MatMul"(%[[v2]], %[[v15]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v27:.*]] = "tf.MatMul"(%[[v4]], %[[v17]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v28:.*]] = "tf.MatMul"(%[[v6]], %[[v19]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v29:.*]] = "tf.MatMul"(%[[v8]], %[[v21]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v30:.*]] = "tf.MatMul"(%[[v10]], %[[v23]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v31:.*]] = "tf.MatMul"(%[[v12]], %[[v25]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + + // CHECK: %[[v32:.*]] = "tf.Pack"(%[[v26]], %[[v27]], 
%[[v28]], %[[v29]], %[[v30]], %[[v31]]) {N = 6 : i64, axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> + // CHECK: %[[v33:.*]] = "tf.Reshape"(%[[v32]], %[[cst_11]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> + + // CHECK: return %[[v33]] : tensor<2x3x4x6xf32> +} + +func @batchMatMulFlatInput(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> tensor<3x4x6xf32> { + %0 = "tf.BatchMatMul"(%arg0, %arg1) : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> + return %0 : tensor<3x4x6xf32> + + // CHECK-LABEL: batchMatMulFlatInput + // CHECK: %[[cst:.*]] = constant dense<[3, 4, 5]> : tensor<3xi64> + // CHECK: %[[cst_0:.*]] = constant dense<[1, 4, 5]> : tensor<3xi64> + // CHECK: %[[cst_1:.*]] = constant dense<[4, 5]> : tensor<2xi64> + // CHECK: %[[cst_2:.*]] = constant dense<[3, 5, 6]> : tensor<3xi64> + // CHECK: %[[cst_3:.*]] = constant dense<0> : tensor<3xi64> + // CHECK: %[[cst_4:.*]] = constant dense<[1, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_5:.*]] = constant dense<[2, 0, 0]> : tensor<3xi64> + // CHECK: %[[cst_6:.*]] = constant dense<[1, 5, 6]> : tensor<3xi64> + // CHECK: %[[cst_7:.*]] = constant dense<[5, 6]> : tensor<2xi64> + // CHECK: %[[cst_8:.*]] = constant dense<[3, 4, 6]> : tensor<3xi64> + + // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<3x4x5xf32>, tensor<3xi64>) -> tensor<3x4x5xf32> + // CHECK: %[[v1:.*]] = "tf.Slice"(%[[v0]], %[[cst_3]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v3:.*]] = "tf.Slice"(%[[v0]], %[[cst_4]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v4:.*]] = "tf.Reshape"(%[[v3]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + // CHECK: %[[v5:.*]] = "tf.Slice"(%[[v0]], %[[cst_5]], %[[cst_0]]) : (tensor<3x4x5xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x4x5xf32> + // CHECK: %[[v6:.*]] = "tf.Reshape"(%[[v5]], %[[cst_1]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> + + // CHECK: %[[v7:.*]] = "tf.Reshape"(%arg1, %[[cst_2]]) : (tensor<3x5x6xf32>, tensor<3xi64>) -> tensor<3x5x6xf32> + // CHECK: %[[v8:.*]] = "tf.Slice"(%[[v7]], %[[cst_3]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v9:.*]] = "tf.Reshape"(%[[v8]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v10:.*]] = "tf.Slice"(%[[v7]], %[[cst_4]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v11:.*]] = "tf.Reshape"(%[[v10]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + // CHECK: %[[v12:.*]] = "tf.Slice"(%[[v7]], %[[cst_5]], %[[cst_6]]) : (tensor<3x5x6xf32>, tensor<3xi64>, tensor<3xi64>) -> tensor<1x5x6xf32> + // CHECK: %[[v13:.*]] = "tf.Reshape"(%[[v12]], %[[cst_7]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> + + // CHECK: %[[v14:.*]] = "tf.MatMul"(%[[v2]], %[[v9]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v15:.*]] = "tf.MatMul"(%[[v4]], %[[v11]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[v16:.*]] = "tf.MatMul"(%[[v6]], %[[v13]]) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, 
tensor<5x6xf32>) -> tensor<4x6xf32> + + // CHECK: %[[v17:.*]] = "tf.Pack"(%[[v14]], %[[v15]], %[[v16]]) {N = 3 : i64, axis = 0 : i64} : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> + // CHECK: %[[v18:.*]] = "tf.Reshape"(%[[v17]], %[[cst_8]]) : (tensor<3x4x6xf32>, tensor<3xi64>) -> tensor<3x4x6xf32> + + // CHECK: return %[[v18]] : tensor<3x4x6xf32> +} + +func @batchMatMulMatrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) -> tensor<4x6xf32> { + %0 = "tf.BatchMatMul"(%arg0, %arg1) : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + return %0 : tensor<4x6xf32> + + // CHECK-LABEL: batchMatMulMatrix + // CHECK: %[[v0:.*]] = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: return %[[v0]] : tensor<4x6xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index fc86eb63753..99d88ba9b93 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -149,7 +149,12 @@ int main(int argc, char **argv) { lower_tensor_list_ops, &result, &pm); if (!status.ok()) return kTrFailure; - auto output = mlir::openOutputFile(output_file_name); + std::string error_msg; + auto output = mlir::openOutputFile(output_file_name, &error_msg); + if (output == nullptr) { + llvm::errs() << error_msg << '\n'; + return kTrFailure; + } output->os() << result; output->keep(); diff --git a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc index b6a898e6cda..29fc88462cb 100644 --- a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc +++ b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include #include +#include +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -353,6 +355,127 @@ struct OphintCompositeOp { std::map outputs; }; +// Preprocess the graph for topological sorting (each operation is a node, +// while inputs/outputs indicate edges). Assume the graph is acyclic. The +// preprocessing does the following: +// Compute each operation's in-degree (how many distinct operations feed it). +// Get all consumer operations for every operation (operation_to_ouputs). +// Get the init_queue (those operations will be processed first). +void PreprocessTopoSortGraph( + Block* block, std::queue* init_queue, + llvm::DenseMap>* operation_to_ouputs, + llvm::DenseMap* operation_to_in_degrees) { + for (auto& op : *block) { + if (&op == block->getTerminator()) continue; + if (op.getNumOperands() == 0) { + init_queue->push(&op); + } else { + // The operand count of an op is not a direct indication of its in-degree: + // a pack op that follows an unpack op takes multiple operands from the + // same producer, which should only count as one edge.
+ llvm::DenseSet input_ops; + for (int i = 0; i < op.getNumOperands(); ++i) { + Operation* input_op = op.getOperand(i)->getDefiningOp(); + if (input_op) input_ops.insert(input_op); + } + if (input_ops.empty()) { + init_queue->push(&op); + continue; + } + operation_to_in_degrees->try_emplace(&op, input_ops.size()); + for (auto* input_op : input_ops) { + auto preceeding_op_it = operation_to_ouputs->find(input_op); + if (preceeding_op_it == operation_to_ouputs->end()) { + auto result = operation_to_ouputs->try_emplace( + input_op, llvm::DenseSet()); + preceeding_op_it = result.first; + } + preceeding_op_it->second.insert(&op); + } + } + } +} + +bool IsSideEffectOp(Operation* op) { + if (op->hasNoSideEffect()) return false; + + // Identity op has no side effect. + // Checking the OperationName might be more elegant here. + auto tf_identity_op = dyn_cast_or_null(op); + if (tf_identity_op) return false; + return true; +} + +// It's possible that other transformations could benefit from this utility +// function, but since there are currently none, we limit it to the ophint +// extraction pass. We may refactor it to extend its usage in the future. +// +// Assume the graph is disconnected from outside. +// Also assume the block has no arguments. +LogicalResult TopoSortOperations(OpBuilder* builder) { + std::queue init_queue; + llvm::DenseMap> operation_to_ouputs; + llvm::DenseMap operation_to_in_degrees; + std::vector sorted_ops; + + PreprocessTopoSortGraph(builder->getBlock(), &init_queue, + &operation_to_ouputs, &operation_to_in_degrees); + while (!init_queue.empty()) { + Operation* current_op = init_queue.front(); + init_queue.pop(); + sorted_ops.push_back(current_op); + + auto current_op_to_output_it = operation_to_ouputs.find(current_op); + if (current_op_to_output_it == operation_to_ouputs.end()) { + continue; + } + for (Operation* output_op : current_op_to_output_it->second) { + auto output_op_it = operation_to_in_degrees.find(output_op); + if (output_op_it == operation_to_in_degrees.end()) return failure(); + + output_op_it->second -= 1; + if (output_op_it->second == 0) { + init_queue.push(output_op); + operation_to_in_degrees.erase(output_op_it); + } + } + operation_to_ouputs.erase(current_op_to_output_it); + } + + // Before we perform the sort, we need to make sure we didn't mess up the + // ordering of the original side-effect operations. + // It's possible those side-effect operations have no topological relations + // at all! + std::vector original_side_effect_ops; + std::vector after_sort_side_effect_ops; + for (auto& op : *builder->getBlock()) { + if (IsSideEffectOp(&op) && (&op != builder->getBlock()->getTerminator())) + original_side_effect_ops.push_back(&op); + } + for (auto* op : sorted_ops) { + if (IsSideEffectOp(op)) after_sort_side_effect_ops.push_back(op); + } + if (original_side_effect_ops.size() != after_sort_side_effect_ops.size()) + return failure(); + for (int i = 0; i < original_side_effect_ops.size(); ++i) { + if (original_side_effect_ops[i] != after_sort_side_effect_ops[i]) + return failure(); + } + + // Perform the sort. + // Ideally we would just clear the block and then write out the sorted ops, + // but unfortunately that's hard to do.
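+  // Instead, walk sorted_ops from back to front and move every earlier op in
+  // sorted_ops directly before the current one; once the loop below finishes,
+  // the ops in the block appear in sorted_ops order.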
+ for (int i = sorted_ops.size() - 1; i > 0; --i) { + Operation* current_op = sorted_ops[i]; + for (int j = i - 1; j >= 0; --j) { + Operation* prev_op = sorted_ops[j]; + prev_op->moveBefore(current_op); + } + } + + return success(); +} + Operation* BuildFusedFuncOp(StringRef func_name, StringRef fused_func_type, Operation* insert_before_op, const std::map& inputs, @@ -360,10 +483,12 @@ Operation* BuildFusedFuncOp(StringRef func_name, StringRef fused_func_type, OpBuilder* builder, ModuleOp* module_op) { SmallVector input_types; SmallVector input_values; + SmallVector input_indexes; for (const auto& kv : inputs) { Value* input = kv.second; input_types.push_back(input->getType()); input_values.push_back(input); + input_indexes.push_back(kv.first); } SmallVector func_output_types; @@ -378,6 +503,8 @@ Operation* BuildFusedFuncOp(StringRef func_name, StringRef fused_func_type, SmallVector attrs; attrs.push_back(builder->getNamedAttr( kTfLiteFunctionName, builder->getStringAttr(fused_func_type))); + attrs.push_back(builder->getNamedAttr( + kTfLiteFunctionInputIndex, builder->getI32ArrayAttr(input_indexes))); FuncOp func_op = FuncOp::create(insert_before_op->getLoc(), func_name, function_type, llvm::makeArrayRef(attrs)); module_op->push_back(func_op); @@ -507,6 +634,10 @@ LogicalResult ConvertOphintToStub(StringRef stub_name, }; builder->getBlock()->walk(removeRemovableOps); + + // Step 8: Topo sort to fix any invalid temporary IRs. + if (failed(TopoSortOperations(builder))) return failure(); + return success(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 65c4eb76a77..ec328304d92 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -20,6 +20,10 @@ include "mlir/Dialect/StandardOps/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +def NonOpaqueElementsAttr : ElementsAttrBase< + CPred<"!$_self.isa()">, + "non-opaque constant tensor">; + def F32ElementsAttr : ElementsAttrBase< CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; @@ -56,8 +60,13 @@ def ExtractSingleElementAsInteger : NativeCodeCall< //===----------------------------------------------------------------------===// // Nullary ops patterns. //===----------------------------------------------------------------------===// + def : Pat<(TF_ConstOp ElementsAttr:$value), (TFL_ConstOp $value)>; +// Convert to std constant for statically shaped, non-opaque constants. +def : Pat<(TF_ConstOp:$res NonOpaqueElementsAttr:$value), (ConstantOp $value), + [(AnyStaticShapeTensor $res)], (addBenefit 10)>; + //===----------------------------------------------------------------------===// // Unary ops patterns. //===----------------------------------------------------------------------===// @@ -157,7 +166,8 @@ def : Pat<(TF_ZerosLikeOp $arg), (TFL_ZerosLikeOp $arg)>; // The following two rules can both match an tf.Placeholder.input node with // min/max/type attributes, so we increase the benefit of the first rule by one // so the tfl.quantize and tfl.dequantize ops will be inserted if it matches. 
-def : Pat<(TF_PlaceholderInputOp $inputs, $min, $max, $type), +def : Pat<(TF_PlaceholderInputOp TensorOf<[F16, F32, F64]>:$inputs, + $min, $max, $type), (TFL_DequantizeOp (TFL_QuantizeOp (TFL_InputOp $inputs), @@ -191,7 +201,8 @@ def : Pat<(TF_GatherV2Op $params, $indices, def : Pat<(TF_FloorDivOp $l, $r), (TFL_FloorDivOp $l, $r)>; -def : Pat<(TF_NotEqualOp $l, $r), (TFL_NotEqualOp $l, $r)>; +def : Pat<(TF_NotEqualOp $l, $r, /*incompatible_shape_error=*/ConstBoolAttrTrue), + (TFL_NotEqualOp $l, $r)>; def : Pat<(TF_LogicalAndOp $l, $r), (TFL_LogicalAndOp $l, $r)>; @@ -252,7 +263,7 @@ def : Pat<(TF_ReluOp (TF_SquaredDifferenceOp $l, $r)), def : Pat<(TF_ReverseV2Op $arg0, $arg1), (TFL_ReverseV2Op $arg0, $arg1)>; -def : Pat<(TF_EqualOp $arg0, $arg1), (TFL_EqualOp $arg0, $arg1)>; +def : Pat<(TF_EqualOp $arg0, $arg1, /*incompatible_shape_error=*/ConstBoolAttrTrue), (TFL_EqualOp $arg0, $arg1)>; def : Pat<(TF_PadOp $arg0, $arg1), (TFL_PadOp $arg0, $arg1)>; @@ -308,3 +319,11 @@ def : Pat<(TF_FloorModOp $arg0, $arg1), (TFL_FloorModOp $arg0, $arg1)>; def : Pat<(TF_ExpOp $arg0), (TFL_ExpOp $arg0)>; def : Pat<(TF_LRNOp $arg0, $radius, F32Attr:$bias, F32Attr:$alpha, F32Attr:$beta), (TFL_LocalResponseNormalizationOp $arg0, (convertIntAttrTo32Bit $radius), $bias, $alpha, $beta)>; + +def : Pat< + (TF_NonMaxSuppressionV4Op $boxes, $scores, $max_output_size, $iou_threshold, $score_threshold, $pad_to_max_output_size), + (TFL_NonMaxSuppressionV4Op $boxes, $scores, $max_output_size, $iou_threshold, $score_threshold)>; + +def : Pat< + (TF_NonMaxSuppressionV5Op $boxes, $scores, $max_output_size, $iou_threshold, $score_threshold, $soft_nms_sigma, $pad_to_max_output_size), + (TFL_NonMaxSuppressionV5Op $boxes, $scores, $max_output_size, $iou_threshold, $score_threshold, $soft_nms_sigma)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc b/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc new file mode 100644 index 00000000000..01e54da1a61 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc @@ -0,0 +1,228 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This transformation pass prepare the tflite fused ops for quantization. 
+ +#include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" + +//===----------------------------------------------------------------------===// +// The LoadQuantizationRecipe Pass. +// +namespace mlir { +namespace TFL { + +namespace { + +// This pass loads the quantization recipe for the TFLite ops to be quantized. +// Specifically, it extends the fused ops with their internal implementation as +// op regions. Each ops in the region produces results with element type +// AnyQuantizedType, thus bitwidth, narrow_range, etc are included. The op also +// defines the op quantization traits, which are used to propgate the +// quantization parameters by the following passes. +struct LoadQuantizationRecipe : public FunctionPass { + void runOnFunction() override; + + private: + void Initialize(LSTMOp lstm, OpBuilder* builder); + + // Create LSTM gates with different weights for input, recurrent and + // cell state, and also the layer normalization parameters. + Operation* CreateGate(Location loc, Value* in, Value* in_w, Value* rec, + Value* rec_w, + llvm::Optional> cell, + Value* ln_w, Value* ln_bias, OpBuilder* builder); + + Operation* CreateLayerNorm(Location loc, Value* in, Value* ln_w, + Value* ln_bias, OpBuilder* builder); + + // Add the internal implementation of the LSTM to its regions. + void LoadForLSTMOp(LSTMOp lstm, OpBuilder* builder); + + StringAttr none_af; + StringAttr fc_format; + BoolAttr keep_dims; + Type int8; + Type int16; + ConstantOp none_cst; +}; + +void LoadQuantizationRecipe::Initialize(LSTMOp lstm, OpBuilder* builder) { + Type expressed_type = + lstm.input()->getType().cast().getElementType(); + Type int8_storage_type = builder->getIntegerType(8); + Type int16_storage_type = builder->getIntegerType(16); + auto flag = quant::QuantizationFlags::FlagValue::Signed; + int64_t int8_min = quant::QuantizedType::getDefaultMininumForInteger( + flag, /*integralWidth=*/8); + int64_t int8_max = quant::QuantizedType::getDefaultMaxinumForInteger( + flag, /*integralWidth=*/8); + int64_t int16_min = quant::QuantizedType::getDefaultMininumForInteger( + flag, /*integralWidth=*/16); + int64_t int16_max = quant::QuantizedType::getDefaultMaxinumForInteger( + flag, /*integralWidth=*/16); + auto any_int8 = quant::AnyQuantizedType::get( + flag, int8_storage_type, expressed_type, int8_min, int8_max); + auto any_int16 = quant::AnyQuantizedType::get( + flag, int16_storage_type, expressed_type, int16_min, int16_max); + + int8 = any_int8.castFromExpressedType(lstm.input()->getType()); + int16 = any_int16.castFromExpressedType(lstm.input()->getType()); +} + +Operation* LoadQuantizationRecipe::CreateLayerNorm(Location loc, Value* in, + Value* ln_w, Value* ln_bias, + OpBuilder* builder) { + // Note that l2_normalization and add ops here are not the execution kernle + // implementation for layer_normalization and we just want to use them to + // model the quantization requirement. 
+ auto l2_norm = builder->create(loc, int16, in, none_af); + auto add = builder->create(loc, int16, in, l2_norm, none_af); + return builder->create(loc, int16, add, ln_w, ln_bias, + none_af, fc_format, keep_dims); +} + +Operation* LoadQuantizationRecipe::CreateGate( + Location loc, Value* in, Value* in_w, Value* rec, Value* rec_w, + llvm::Optional> cell, Value* ln_w, Value* ln_bias, + OpBuilder* builder) { + auto s1 = builder->create(loc, int16, in, in_w, none_cst, + none_af, fc_format, keep_dims); + auto s2 = builder->create(loc, int16, rec, rec_w, none_cst, + none_af, fc_format, keep_dims); + + AddNOp s4; + if (cell.hasValue()) { + auto s3 = builder->create(loc, int16, cell.getValue().first, + cell.getValue().second, none_af); + s4 = builder->create( + loc, int16, + llvm::ArrayRef( + {*s1.output().begin(), *s2.output().begin(), s3.output()})); + + } else { + s4 = builder->create( + loc, int16, + llvm::ArrayRef({*s1.output().begin(), *s2.output().begin()})); + } + + auto s5 = CreateLayerNorm(loc, s4.sum(), ln_w, ln_bias, builder); + + if (cell.hasValue()) { + return builder->create(loc, int16, s5->getResult(0)); + } else { + return builder->create(loc, int16, s5->getResult(0)); + } +} + +void LoadQuantizationRecipe::LoadForLSTMOp(LSTMOp lstm, OpBuilder* builder) { + Initialize(lstm, builder); + + Region region; + region.push_back(new Block); + builder->setInsertionPointToEnd(®ion.front()); + Location loc = lstm.getLoc(); + Type int32_type = builder->getIntegerType(32); + Type int32_tensor = builder->getTensorType(int32_type); + none_cst = builder->create(loc, builder->getNoneType(), + builder->getUnitAttr()); + + auto input_gate = CreateGate( + loc, lstm.input(), lstm.input_to_input_weights(), + lstm.input_activation_state(), lstm.recurrent_to_input_weights(), + llvm::Optional>( + {lstm.input_cell_state(), lstm.cell_to_input_weights()}), + lstm.input_layer_norm_coefficients(), lstm.input_gate_bias(), builder); + + auto forget_gate = CreateGate( + loc, lstm.input(), lstm.input_to_forget_weights(), + lstm.input_activation_state(), lstm.recurrent_to_forget_weights(), + llvm::Optional>( + {lstm.input_cell_state(), lstm.cell_to_forget_weights()}), + lstm.forget_layer_norm_coefficients(), lstm.forget_gate_bias(), builder); + + auto cell_gate = CreateGate(loc, lstm.input(), lstm.input_to_cell_weights(), + lstm.input_activation_state(), + lstm.recurrent_to_cell_weights(), llvm::None, + lstm.cell_layer_norm_coefficients(), + lstm.cell_bias(), builder); + + auto forget_cell_state = builder->create( + loc, int16, forget_gate->getResult(0), lstm.input_cell_state(), none_af); + auto input_cell_state = builder->create( + loc, int16, input_gate->getResult(0), cell_gate->getResult(0), none_af); + auto new_cell = builder->create(loc, int16, forget_cell_state.output(), + input_cell_state.output(), none_af); + + auto output_gate = CreateGate( + loc, lstm.input(), lstm.input_to_output_weights(), + lstm.input_activation_state(), lstm.recurrent_to_output_weights(), + llvm::Optional>( + {new_cell, lstm.cell_to_output_weights()}), + lstm.output_layer_norm_coefficients(), lstm.output_gate_bias(), builder); + + auto new_cell_tanh = builder->create(loc, int16, new_cell); + auto hidden_state = builder->create( + loc, int16, new_cell_tanh.y(), output_gate->getResult(0), none_af); + auto act = builder->create( + loc, int8, hidden_state.output(), lstm.projection_weights(), + lstm.projection_bias(), none_af, fc_format, keep_dims); + + // TODO(fengliuai): define and register the op in the QuantOps Dialect. 
+ OperationState return_state(loc, "tf_quant.pseudo_return", act.getResult(0), + {int8}, {}); + builder->createOperation(return_state); + + lstm.internal().takeBody(region); +} + +void LoadQuantizationRecipe::runOnFunction() { + FuncOp func = getFunction(); + OpBuilder builder(func); + none_af = builder.getStringAttr("NONE"); + fc_format = builder.getStringAttr("DEFAULT"); + keep_dims = builder.getBoolAttr(false); + + func.walk([&](Operation* op) { + if (auto lstm = llvm::dyn_cast(op)) { + LoadForLSTMOp(lstm, &builder); + } + // Handles other ops. + }); +} + +} // namespace + +// Creates an instance of the TensorFlow Lite dialect LoadQuantizationRecipe +// pass. +std::unique_ptr CreateLoadQuantizationRecipePass() { + return absl::make_unique(); +} + +static PassRegistration pass( + "tfl-load-recipe", "Load TFL op quantization recipe"); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 716c8216433..35dd5e0a75d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -429,12 +429,14 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( } else if (auto tf_op = llvm::dyn_cast(op)) { if (!(tf_op.element_dtype().isF16() || tf_op.element_dtype().isF32() || tf_op.element_dtype().isF64() || + tf_op.element_dtype().isInteger(1) || tf_op.element_dtype().isInteger(8) || tf_op.element_dtype().isInteger(16) || tf_op.element_dtype().isInteger(32) || tf_op.element_dtype().isInteger(64))) { return tf_op.emitError( - "requires element_dtype to be 8-bit/16-bit/32-bit/64-bit integer " + "requires element_dtype to be 1-bit/8-bit/16-bit/32-bit/64-bit " + "integer " "or 16-bit/32-bit/64-bit " "float type during TF Lite transformation pass"); } @@ -461,6 +463,10 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( auto c = ConvertTFTensorListPushBack(context); rewriter->setInsertionPoint(op); c.matchAndRewrite(op, *rewriter); + } else if (auto tf_op = llvm::dyn_cast(op)) { + auto c = TFL::ConvertTFTensorListLength(context); + rewriter->setInsertionPoint(op); + c.matchAndRewrite(op, *rewriter); } else if (auto tf_op = llvm::dyn_cast(op)) { if (op->getAttr("T")) op->removeAttr(Identifier::get("T", context)); UpdateWhileFunctionType(tf_op); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 51610832db6..9de40eb3cd6 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -122,6 +122,8 @@ class OperandHasRank : Constraint< // Mul->Rsqrt->Sum->Square // Currently L2Normalization doesn't support activation function // in TFLite. +// TODO(karimnosseir): Add constraints that the kernel code assumes. +// constraint on axis and depth. def : Pat<(TFL_MulOp $operand1, (TFL_RsqrtOp (TFL_SumOp @@ -130,13 +132,14 @@ def : Pat<(TFL_MulOp $operand1, $keep_dims)), TFL_AF_None), (TFL_L2NormalizationOp $operand1, TFL_AF_None), - [(EqualOperands $operand1, $square_operand), - (OperandHasRank<1> $operand1)]>; + [(EqualOperands $operand1, $square_operand)]>; // This pattern constructs L2NormalizationOp from // Div->sqrt->Sum->Square // Currently L2Normalization doesn't support activation function // in TFLite. +// TODO(karimnosseir): Add constraints that the kernel code assumes. 
+// constraint on axis and depth. def : Pat<(TFL_DivOp $operand1, (TFL_SqrtOp (TFL_SumOp @@ -145,5 +148,4 @@ def : Pat<(TFL_DivOp $operand1, $keep_dims)), TFL_AF_None), (TFL_L2NormalizationOp $operand1, TFL_AF_None), - [(EqualOperands $operand1, $square_operand), - (OperandHasRank<1> $operand1)]>; + [(EqualOperands $operand1, $square_operand)]>; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index e3dabb7a48d..7cb89c4219c 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -18,6 +18,14 @@ include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" def FalseBoolAttr : AttrConstraint>; +def NonOpaqueElementsAttr : ElementsAttrBase< + CPred<"!$_self.isa()">, + "non-opaque constant tensor">; + +// Convert to std constant for statically shaped, non-opaque constants. +def : Pat<(TF_ConstOp:$res NonOpaqueElementsAttr:$value), (ConstantOp $value), + [(AnyStaticShapeTensor $res)]>; + // Converts tf.FusedBatchNorm & tf.FusedBatchNormV3 into a sequence of more primitive arithmetic // operations. Specifically, performs the following calculation: // @@ -81,8 +89,8 @@ class TFi32 : ConstantAttr(v)>; def : Pat<(TF_MatMulOp $a, $b, ConstBoolAttrFalse:$at, ConstBoolAttrFalse), (TF_MatMulOp $a, (TF_TransposeOp $b, (TF_SubOp (TF_RangeOp /*start=*/(TF_RankOp $b), - /*limit=*/(ConstantOp TFi32<0>), - /*delta=*/(ConstantOp TFi32<-1>)), (ConstantOp TFi32<1>))), + /*limit=*/(TF_ConstOp TFi32<0>), + /*delta=*/(TF_ConstOp TFi32<-1>)), (TF_ConstOp TFi32<1>))), $at, ConstBoolAttrTrue)>; // Matmul with transpose on a to matmul with explicit transpose op and a not @@ -90,10 +98,12 @@ def : Pat<(TF_MatMulOp $a, $b, ConstBoolAttrFalse:$at, ConstBoolAttrFalse), def : Pat<(TF_MatMulOp $a, $b, ConstBoolAttrTrue, $bt), (TF_MatMulOp (TF_TransposeOp $a, (TF_SubOp (TF_RangeOp /*start=*/(TF_RankOp $a), - /*limit=*/(ConstantOp TFi32<0>), - /*delta=*/(ConstantOp TFi32<-1>)), (ConstantOp TFi32<1>))), $b, + /*limit=*/(TF_ConstOp TFi32<0>), + /*delta=*/(TF_ConstOp TFi32<-1>)), (TF_ConstOp TFi32<1>))), $b, ConstBoolAttrFalse, $bt)>; +// Partially supported in TFLite, treated as passthrough IdentityOp +def : Pat<(TF_CheckNumericsOp $arg, $msg), (TF_IdentityOp $arg)>; def : Pat<(TF_SnapshotOp $arg), (TF_IdentityOp $arg)>; def : Pat<(TF_StopGradientOp $arg), (TF_IdentityOp $arg)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 7c7983ae254..2b91b2f4177 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -50,6 +50,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/lite/transforms/unroll_batch_matmul.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -246,7 +247,8 @@ struct ConvertTFConvOp : public RewritePattern { filter_type.getShape()); auto bias_type = rewriter.getTensorType({bias_dim}, elem_type); auto bias_attr = rewriter.getZeroAttr(bias_type); - auto bias = rewriter.create(op->getLoc(), bias_type, bias_attr); + auto bias = + rewriter.create(op->getLoc(), bias_type, bias_attr); auto *conv_state = static_cast(state.get()); auto conv_op = static_cast(this)->createTFLOp( @@ -297,7 +299,7 @@ class ConvertTFConv2D : public ConvertTFConvOp { rewriter.getIntegerType(32)); auto perm_attr = DenseElementsAttr::get(perm_type, llvm::makeArrayRef(perm)); - auto perm_op = rewriter.create(loc, perm_type, perm_attr); + auto perm_op = rewriter.create(loc, perm_type, perm_attr); // Create tensor type for the transpose result. auto filter_type = filter->getType().cast(); @@ -366,7 +368,7 @@ class ConvertTFDepthwiseConv2dNative auto shape_type = rewriter.getTensorType({4}, rewriter.getIntegerType(64)); auto shape_attr = DenseElementsAttr::get(shape_type, llvm::makeArrayRef(result_shape)); - auto shape = rewriter.create(loc, shape_type, shape_attr); + auto shape = rewriter.create(loc, shape_type, shape_attr); return rewriter.create(loc, result_type, filter, shape); } @@ -377,6 +379,11 @@ class ConvertTFDepthwiseConv2dNative void PrepareTFPass::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); + + patterns.insert, + ConvertTFBatchMatMulOp>(&getContext()); + applyPatternsGreedily(func, patterns); + // This pattern was intented to uses TFL QDQs to preserve the quantization // parameters from the TF Quant ops, thus this pattern should run with the // first `applyPatternsGreedily` method, which would otherwise removes the diff --git a/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td b/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td index 764f8e95f55..167a4be3579 100644 --- a/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td @@ -14,9 +14,13 @@ limitations under the License. ==============================================================================*/ include "mlir/IR/OpBase.td" +include "mlir/Dialect/StandardOps/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +def CreateTFShapeOp : NativeCodeCall< + "$_builder.create($0->getLoc(), $1, $2)">; + //===----------------------------------------------------------------------===// // TensorList transformation patterns. // Note that the pattern below rewrites `TensorList` tensors (which has type DT_VARIANT) @@ -34,3 +38,11 @@ def ConvertTFTensorListStack : Pat< def ConvertTFTensorListGetItem : Pat< (TF_TensorListGetItemOp $input, $index, $element_shape), (TF_GatherOp $input, $index, (NativeCodeCall<"$_builder.getBoolAttr(true)">))>; + +// TensorListLength is equivalent to the size of the first dimension of the +// input tensorlist, rewrite it to a combination of Gather and Shape op. 
+def ConvertTFTensorListLength: Pat< + (TF_TensorListLengthOp:$old_value $input), + (TF_GatherOp + (CreateTFShapeOp $old_value, $input, /*use 32bit*/ConstBoolAttrTrue), + (ConstantOp ConstantAttr), ConstBoolAttrTrue)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/unroll_batch_matmul.cc b/tensorflow/compiler/mlir/lite/transforms/unroll_batch_matmul.cc new file mode 100644 index 00000000000..50b644f9635 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/unroll_batch_matmul.cc @@ -0,0 +1,309 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/transforms/unroll_batch_matmul.h" + +#include +#include + +#include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "mlir/Analysis/LoopAnalysis.h" // TF:local_config_mlir +#include "mlir/Dialect/QuantOps/FakeQuantSupport.h" // TF:local_config_mlir +#include "mlir/Dialect/QuantOps/UniformSupport.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/OpImplementation.h" // TF:local_config_mlir +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Support/Functional.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/validators.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace mlir { +namespace TFL { + +namespace { +// Unrolls a BatchMatMul on the batch dimension. We need to slice each batch out +// of the inputs, matmul them individually, then stack them all back together at +// the end. 
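+// For example, mirroring the batchMatMulTwoDim test above: multiplying a
+// [2, 3, 4, 5] tensor by a [2, 3, 5, 6] tensor reshapes the operands to
+// [6, 4, 5] and [6, 5, 6], slices out six [4, 5] and six [5, 6] matrices,
+// runs six tf.MatMul ops that each produce a [4, 6] result, packs them into a
+// [6, 4, 6] tensor, and reshapes that back to the output shape [2, 3, 4, 6].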
+struct UnrollBatchMatMulPass : public FunctionPass { + void runOnFunction() override; +}; + +void UnrollBatchMatMulPass::runOnFunction() { + OwningRewritePatternList patterns; + auto func = getFunction(); + + patterns.insert, + ConvertTFBatchMatMulOp>(&getContext()); + applyPatternsGreedily(func, patterns); +} + +} // namespace + +template +TF::ReshapeOp ConvertTFBatchMatMulOp::createReshapeOp( + Value* value, ArrayRef shape, Type elementType, Location loc, + PatternRewriter& rewriter) { + int64_t shape_rank = shape.size(); + auto shapeSpecType = + rewriter.getTensorType({shape_rank}, rewriter.getIntegerType(64)); + Type resultType = rewriter.getTensorType(shape, elementType); + auto constant_attr = DenseElementsAttr::get(shapeSpecType, shape); + auto shapeTensor = + rewriter.create(loc, shapeSpecType, constant_attr); + return rewriter.create(loc, resultType, /*tensor=*/value, + /*shape=*/shapeTensor); +} + +template +std::vector ConvertTFBatchMatMulOp::sliceInput( + Value* value, int batch_size, Location loc, PatternRewriter& rewriter) { + RankedTensorType tensorType = value->getType().cast(); + Type elementType = tensorType.getElementType(); + + int rank = tensorType.getShape().size(); + int num_rows = tensorType.getShape()[rank - 2]; + int num_cols = tensorType.getShape()[rank - 1]; + + // Reshape to rank-3 Tensor with first dimension as the batch size. + auto reshapeOp = createReshapeOp(value, {batch_size, num_rows, num_cols}, + elementType, loc, rewriter); + + SmallVector sliceSize = {1, num_rows, num_cols}; + + std::vector sliced; + Type int64Type = rewriter.getIntegerType(64); + Type sliceResultType = rewriter.getTensorType(sliceSize, elementType); + + // Slice along each batch index and remember the slice output for future + // use. + for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + auto vector3Type = rewriter.getTensorType({3}, int64Type); + + auto begin_attr = + DenseElementsAttr::get(vector3Type, {batch_idx, 0, 0}); + auto size_attr = DenseElementsAttr::get(vector3Type, sliceSize); + auto begin = rewriter.create(loc, vector3Type, begin_attr); + auto size = rewriter.create(loc, vector3Type, size_attr); + auto sliceOp = + rewriter.create(loc, sliceResultType, + /*input=*/reshapeOp.output(), begin, size); + + // Squeeze matrix, i.e. 
reshape [1, num_rows, num_cols] -> [num_rows, + // num_cols] + auto squeezeOp = createReshapeOp(sliceOp.output(), {num_rows, num_cols}, + elementType, loc, rewriter); + + sliced.emplace_back(squeezeOp.output()); + } + return sliced; +} + +template +TF::TransposeOp ConvertTFBatchMatMulOp::createTransposeOp( + Value* value, Location loc, PatternRewriter& rewriter) { + auto valueType = value->getType().cast(); + auto shape = valueType.getShape(); + int dims = shape.size(); + + std::vector perm(dims); + for (int i = 0; i < dims - 2; i++) { + perm[i] = i; + } + perm[dims - 2] = dims - 1; + perm[dims - 1] = dims - 2; + + auto perm_type = rewriter.getTensorType({static_cast(perm.size())}, + rewriter.getIntegerType(32)); + + auto perm_attr = DenseElementsAttr::get(perm_type, llvm::makeArrayRef(perm)); + auto perm_op = rewriter.create(loc, perm_type, perm_attr); + + std::vector transposed_shape(shape.begin(), shape.end()); + int64_t r = transposed_shape[dims - 1]; + int64_t c = transposed_shape[dims - 2]; + + transposed_shape[dims - 1] = c; + transposed_shape[dims - 2] = r; + + auto transposed_type = + rewriter.getTensorType(transposed_shape, valueType.getElementType()); + return rewriter.create(loc, transposed_type, value, perm_op); +} + +template +TF::PackOp ConvertTFBatchMatMulOp::createMatMulOps( + const std::vector& sliced_lhs, + const std::vector& sliced_rhs, const tensorflow::MatMulBCast& bcast, + int rows, int cols, Type elementType, Location loc, + PatternRewriter& rewriter) { + auto matmulType = rewriter.getTensorType({rows, cols}, elementType); + + std::vector matmuls; + for (int batch_idx = 0; batch_idx < bcast.output_batch_size(); ++batch_idx) { + int lhs_batch_idx, rhs_batch_idx; + if (bcast.IsBroadcastingRequired()) { + lhs_batch_idx = bcast.x_batch_indices()[batch_idx]; + rhs_batch_idx = bcast.y_batch_indices()[batch_idx]; + } else { + lhs_batch_idx = batch_idx; + rhs_batch_idx = batch_idx; + } + auto false_attr = rewriter.getBoolAttr(false); + auto matmul = rewriter.create(loc, matmulType, + /*a=*/sliced_lhs[lhs_batch_idx], + /*b=*/sliced_rhs[rhs_batch_idx], + /*transpose_a=*/false_attr, + /*transpose_b=*/false_attr); + matmuls.emplace_back(matmul.product()); + } + + // Combine the result of each individual MatMul into a rank-3 Tensor. + Type packedType = rewriter.getTensorType( + {bcast.output_batch_size(), rows, cols}, elementType); + + auto N = rewriter.getI64IntegerAttr(matmuls.size()); + auto axis = rewriter.getI64IntegerAttr(0); + return rewriter.create(loc, packedType, + /*values=*/matmuls, N, axis); +} + +template +PatternMatchResult ConvertTFBatchMatMulOp::matchAndRewrite( + BatchMatMulOpType op, PatternRewriter& rewriter) const { + Value* input_lhs = op.x(); + Value* input_rhs = op.y(); + + if (!input_lhs->getType().isa()) { + // LHS must be a ranked tensor type + return this->matchFailure(); + } + if (!input_rhs->getType().isa()) { + // RHS must be a ranked tensor type + return this->matchFailure(); + } + + auto lhs_type = input_lhs->getType().cast(); + auto rhs_type = input_rhs->getType().cast(); + + auto elementType = lhs_type.getElementType(); + + if (elementType != rhs_type.getElementType()) { + // The element type of LHS must be the same with element type of RHS + return this->matchFailure(); + } + + auto lhs_shape = lhs_type.getShape(); + auto rhs_shape = rhs_type.getShape(); + + Location loc = op.getLoc(); + + // Transpose LHS input if necessary. 
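+  // The adj_x/adj_y attributes are lowered to explicit tf.Transpose ops
+  // (via createTransposeOp) that swap the two innermost dimensions before
+  // the inputs are sliced.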
+ if (op.adj_x()) { + input_lhs = createTransposeOp(input_lhs, loc, rewriter); + + lhs_type = input_lhs->getType().cast(); + lhs_shape = lhs_type.getShape(); + } + + // Transpose RHS input if necessary. + if (op.adj_y()) { + input_rhs = createTransposeOp(input_rhs, loc, rewriter); + + rhs_type = input_rhs->getType().cast(); + rhs_shape = rhs_type.getShape(); + } + + // Ensure that input ranks are at least 2 and batch shapes are + // broadcastable. + const int dims_a = lhs_shape.size(); + const int dims_b = rhs_shape.size(); + if (dims_a < 2 || dims_b < 2) { + // Both inputs must have rank >= 2 + return this->matchFailure(); + } + + if (lhs_shape[dims_a - 1] != rhs_shape[dims_b - 2]) { + // Input dimensions must be compatible for multipication. + return this->matchFailure(); + } + + if (dims_a == 2 && dims_b == 2) { + // When both inputs are matrices, just replace the op to a matmul op. + Type resultType = + rewriter.getTensorType({lhs_shape[0], rhs_shape[1]}, elementType); + auto false_attr = rewriter.getBoolAttr(false); + rewriter.replaceOpWithNewOp(op, resultType, + /*a=*/input_lhs, + /*b=*/input_rhs, + /*transpose_a=*/false_attr, + /*transpose_b=*/false_attr); + return this->matchSuccess(); + } + + tensorflow::MatMulBCast bcast(absl::InlinedVector( + lhs_shape.begin(), lhs_shape.end()), + absl::InlinedVector( + rhs_shape.begin(), rhs_shape.end())); + + if (!bcast.IsValid()) { + // Input batch dimensions must be broadcastable + return this->matchFailure(); + } + + // Compute slices for each batch in the LHS and RHS. + std::vector sliced_lhs = + sliceInput(input_lhs, bcast.x_batch_size(), loc, rewriter); + std::vector sliced_rhs = + sliceInput(input_rhs, bcast.y_batch_size(), loc, rewriter); + + // Compute (single batch) MatMul for each output batch. The MatMul outputs + // are then packed together into one output Tensor. + auto packOp = + createMatMulOps(sliced_lhs, sliced_rhs, bcast, lhs_shape[dims_a - 2], + rhs_shape[dims_b - 1], elementType, loc, rewriter); + + // Reshape the rank-3 Tensor into the correct output shape. + const auto& resultBatchShape = bcast.output_batch_shape().dim_sizes(); + std::vector resultShape(resultBatchShape.begin(), + resultBatchShape.end()); + resultShape.push_back(lhs_shape[dims_a - 2]); + resultShape.push_back(rhs_shape[dims_b - 1]); + + auto reshapeOp = + createReshapeOp(packOp.output(), resultShape, elementType, loc, rewriter); + rewriter.replaceOp(op, reshapeOp.output()); + return this->matchSuccess(); +} + +static PassRegistration pass( + "tfl-unroll-batch-matmul", + "Unroll TF BatchMatMul op into Reshape, Slice, MatMul, Pack ops."); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/unroll_batch_matmul.h b/tensorflow/compiler/mlir/lite/transforms/unroll_batch_matmul.h new file mode 100644 index 00000000000..d4b46eabf7d --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/unroll_batch_matmul.h @@ -0,0 +1,60 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_UNROLL_BATCH_MATMUL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_UNROLL_BATCH_MATMUL_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace mlir { +namespace TFL { + +// Unroll tf.BatchMatMulV2 op into a sequence of TF ops. Since TFLite does not +// support BatchMatMul operation, it unrolls a BatchMatMul op into tf.Reshape, +// tf.Slice, tf.MatMul, tf.Pack, and tf.Reshape ops. +template +class ConvertTFBatchMatMulOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + static TF::ReshapeOp createReshapeOp(Value* value, ArrayRef shape, + Type elementType, Location loc, + PatternRewriter& rewriter); + + static std::vector sliceInput(Value* value, int batch_size, + Location loc, + PatternRewriter& rewriter); + + static TF::TransposeOp createTransposeOp(Value* value, Location loc, + PatternRewriter& rewriter); + + static TF::PackOp createMatMulOps(const std::vector& sliced_lhs, + const std::vector& sliced_rhs, + const tensorflow::MatMulBCast& bcast, + int rows, int cols, Type elementType, + Location loc, PatternRewriter& rewriter); + + PatternMatchResult matchAndRewrite(BatchMatMulOpType op, + PatternRewriter& rewriter) const override; +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_UNROLL_BATCH_MATMUL_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc new file mode 100644 index 00000000000..d98101bd4cb --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc @@ -0,0 +1,456 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/utils/lstm_utils.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/Identifier.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/OpDefinition.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFL { + +namespace { + +Value* CreateI32SplatConst(OpBuilder* builder, ArrayRef shape, + int32_t val, mlir::Location location) { + auto type = builder->getTensorType(shape, builder->getIntegerType(32)); + auto attr = DenseElementsAttr::get(type, val); + return builder->create(location, type, attr); +} + +Value* CreateF32SplatConst(OpBuilder* builder, ArrayRef shape, + float val, mlir::Location location) { + auto type = builder->getTensorType(shape, builder->getF32Type()); + auto attr = DenseElementsAttr::get(type, val); + return builder->create(location, type, attr); +} + +Value* CreateI64DenseConst(OpBuilder* builder, ArrayRef shape, + ArrayRef values, mlir::Location location) { + auto type = builder->getTensorType(static_cast(shape.size()), + builder->getIntegerType(64)); + auto attr = DenseElementsAttr::get(type, values); + return builder->create(location, type, attr); +} + +Value* CreateNoneValue(OpBuilder* builder, mlir::Location location) { + return builder->create(location, builder->getNoneType(), + builder->getUnitAttr()); +} + +Value* Transpose2D(OpBuilder* builder, Value* value_to_transpose, + RankedTensorType type, mlir::Location location) { + // Create a constant op for transpose permutation. + SmallVector perm = {1, 0}; + auto perm_op = CreateI64DenseConst(builder, perm, perm, location); + + // Create tensor type for the transpose result. 
+ auto transpose_type = type; + auto transpose_shape = functional::map( + [transpose_type](int64_t dim) { return transpose_type.getDimSize(dim); }, + perm); + auto elem_type = transpose_type.getElementType(); + auto result_type = builder->getTensorType(transpose_shape, elem_type); + + return builder->create(location, result_type, + value_to_transpose, perm_op); +} + +Value* SliceRankedTensor(OpBuilder* builder, Value* input, + ArrayRef begin_shape, + ArrayRef begin_values, + ArrayRef size_shape, + ArrayRef size_values, + mlir::Location location) { + // Create a dense constant op for slice's begin + auto slice_i2c_begin = + CreateI64DenseConst(builder, begin_shape, begin_values, location); + + // Create a dense constant op for slice's size + auto slice_i2c_size = + CreateI64DenseConst(builder, size_shape, size_values, location); + + return builder->create( + location, + builder->getTensorType( + size_values, + input->getType().cast().getElementType()), + input, slice_i2c_begin, slice_i2c_size); +} + +} // namespace + +void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForInputToCellGate() { + SmallVector begin_i2c_values = {0, 0}; + input2cell_ = SliceRankedTensor( + &builder_, weight_transposed_, weight_slice_shape_, begin_i2c_values, + weight_slice_shape_, weight_slice_size_input_values_, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForInputToInputGate() { + SmallVector begin_i2i_values = {n_cell_, 0}; + input2input_ = couple_input_forget_gates_ + ? none_ + : SliceRankedTensor(&builder_, weight_transposed_, + weight_slice_shape_, begin_i2i_values, + weight_slice_shape_, + weight_slice_size_input_values_, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForInputToForgetGate() { + int input_forget_start = couple_input_forget_gates_ ? n_cell_ : 2 * n_cell_; + SmallVector begin_i2f_values = {input_forget_start, 0}; + input2forget_ = SliceRankedTensor( + &builder_, weight_transposed_, weight_slice_shape_, begin_i2f_values, + weight_slice_shape_, weight_slice_size_input_values_, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForInputToOutputGate() { + int input_output_start = + couple_input_forget_gates_ ? 2 * n_cell_ : 3 * n_cell_; + SmallVector begin_i2o_values = {input_output_start, 0}; + input2output_ = SliceRankedTensor( + &builder_, weight_transposed_, weight_slice_shape_, begin_i2o_values, + weight_slice_shape_, weight_slice_size_input_values_, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForRecurrentToCellGate() { + SmallVector begin_rec2c_values = {0, n_input_}; + rec2cell_ = SliceRankedTensor( + &builder_, weight_transposed_, weight_slice_shape_, begin_rec2c_values, + weight_slice_shape_, weight_slice_size_recurrent_values_, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForRecurrentToInputGate() { + SmallVector begin_rec2i_values = {n_cell_, n_input_}; + rec2input_ = couple_input_forget_gates_ + ? none_ + : SliceRankedTensor(&builder_, weight_transposed_, + weight_slice_shape_, begin_rec2i_values, + weight_slice_shape_, + weight_slice_size_recurrent_values_, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForRecurrentToForgetGate() { + int rec_forget_start = couple_input_forget_gates_ ? 
n_cell_ : 2 * n_cell_; + SmallVector begin_rec2f_values = {rec_forget_start, n_input_}; + rec2forget_ = SliceRankedTensor( + &builder_, weight_transposed_, weight_slice_shape_, begin_rec2f_values, + weight_slice_shape_, weight_slice_size_recurrent_values_, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForRecurrentToOutputGate() { + int rec_output_start = couple_input_forget_gates_ ? 2 * n_cell_ : 3 * n_cell_; + SmallVector begin_rec2o_values = {rec_output_start, n_input_}; + rec2output_ = SliceRankedTensor( + &builder_, weight_transposed_, weight_slice_shape_, begin_rec2o_values, + weight_slice_shape_, weight_slice_size_recurrent_values_, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetBiasToCellGate() { + SmallVector begin_bias2c_values = {0}; + bias2cell_ = SliceRankedTensor(&builder_, bias_, bias_slice_shape_, + begin_bias2c_values, bias_slice_shape_, + bias_size_values_, fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetBiasToInputGate() { + SmallVector begin_bias2i_values = {n_cell_}; + bias2input_ = + couple_input_forget_gates_ + ? none_ + : SliceRankedTensor(&builder_, bias_, bias_slice_shape_, + begin_bias2i_values, bias_slice_shape_, + bias_size_values_, fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetBiasToForgetGate() { + int bias_forget_start = couple_input_forget_gates_ ? n_cell_ : 2 * n_cell_; + SmallVector begin_bias2f_values = {bias_forget_start}; + bias2forget_ = SliceRankedTensor(&builder_, bias_, bias_slice_shape_, + begin_bias2f_values, bias_slice_shape_, + bias_size_values_, fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetBiasToOutputGate() { + int bias_output_start = + couple_input_forget_gates_ ? 2 * n_cell_ : 3 * n_cell_; + SmallVector begin_bias2o_values = {bias_output_start}; + bias2output_ = SliceRankedTensor(&builder_, bias_, bias_slice_shape_, + begin_bias2o_values, bias_slice_shape_, + bias_size_values_, fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetProjection() { + SmallVector projection_slice_shape = { + 1, num_cols_projection_transposed_}; + SmallVector projection_slice_size_values = {n_output_, n_cell_}; + SmallVector projection_slice_begin_values = {0, 0}; + proj_weight_ = + !projection_ + ? none_ + : SliceRankedTensor( + &builder_, projection_transposed_, projection_slice_shape, + projection_slice_begin_values, projection_slice_shape, + projection_slice_size_values, fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetProjectionBias() { + proj_bias_ = !projection_type_ + ? 
none_ + : CreateF32SplatConst(&builder_, {n_output_}, 0, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetInputActivationState() { + input_activation_state_ = CreateF32SplatConst(&builder_, {1, n_output_}, 0, + fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetInputCellState() { + input_cell_state_ = + CreateF32SplatConst(&builder_, {1, n_cell_}, 0, fused_func_op_.getLoc()); +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetCellLayerNormCoefficients() { + cell_layer_norm_coefficients_ = none_; +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetInputLayerNormCoefficients() { + input_layer_norm_coefficients_ = none_; +} + +void ConvertLSTMCellSimpleToFusedLSTM::SetForgetLayerNormCoefficients() { + forget_layer_norm_coefficients_ = none_; +} +void ConvertLSTMCellSimpleToFusedLSTM::SetOutputLayerNormCoefficients() { + output_layer_norm_coefficients_ = none_; +} + +void ConvertLSTMCellSimpleToFusedLSTM::GenerateFusedOpOperands() { + // Transpose both weight and projection. + weight_transposed_ = + Transpose2D(&builder_, weight_, weight_type_, fused_func_op_.getLoc()); + projection_transposed_ = Transpose2D(&builder_, projection_, projection_type_, + fused_func_op_.getLoc()); + + none_ = CreateNoneValue(&builder_, fused_func_op_.getLoc()); + // Extract input to cifg gates via slicing the weight tensor + SetWeightForInputToCellGate(); + SetWeightForInputToInputGate(); + SetWeightForInputToForgetGate(); + SetWeightForInputToOutputGate(); + + // Extract recurrent to cifg gates via slicing the weight tensor + SetWeightForRecurrentToCellGate(); + SetWeightForRecurrentToInputGate(); + SetWeightForRecurrentToForgetGate(); + SetWeightForRecurrentToOutputGate(); + + // Extract bias to cifg gates via slicing the bias tensor + SetBiasToCellGate(); + SetBiasToInputGate(); + SetBiasToForgetGate(); + SetBiasToOutputGate(); + + // Extract projection and set an empty projection bias + SetProjection(); + SetProjectionBias(); + + // Set the variable tensors + SetInputActivationState(); + SetInputCellState(); + + // Extract the layer norm coefficients + SetCellLayerNormCoefficients(); + SetInputLayerNormCoefficients(); + SetForgetLayerNormCoefficients(); + SetOutputLayerNormCoefficients(); +} + +void ConvertLSTMCellSimpleToFusedLSTM::UpdateFuncSignature() { + // https://github.com/tensorflow/community/pull/113 + auto attr = fused_func_op_.getAttrOfType("tf_.implements"); + if (!attr) { + fused_func_op_.setAttr("tf._implements", + builder_.getStringAttr(GetCompositeOpName())); + } + SmallVector output_shape{1, n_output_}; + auto input_types = fused_func_op_.getType().getInputs(); + auto output_type = builder_.getTensorType( + output_shape, + input_->getType().cast().getElementType()); + fused_func_op_.setType(mlir::FunctionType::get(input_types, output_type, + fused_func_op_.getContext())); +} + +void ConvertLSTMCellSimpleToFusedLSTM::RewriteFunc() { + // Update the func signature, based on output shape. + // The func will ultimately return the output of the fused + // LSTM op. + UpdateFuncSignature(); + + // Transoform the weights, projection, bias and layer norm coefficients + // to generate operands for the TFL fused LSTM op. + GenerateFusedOpOperands(); + + // Create the fused LSTM op. 
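+  // The builder call below passes the 24 operands in the order expected by the
+  // TFL fused LSTM op: input, the four input-to-gate weights, the four
+  // recurrent-to-gate weights, the three peephole (cell-to-gate) weights, here
+  // all none_, the four gate biases, the projection weight and bias, the two
+  // state tensors, and the four layer-norm coefficient tensors. The trailing
+  // attributes select the TANH activation, a cell clip of 10.0, a disabled
+  // projection clip (0.0), and the FULL kernel type.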
+ SmallVector output_shape = {1, n_output_}; + auto result_type = builder_.getTensorType( + output_shape, + input_->getType().cast().getElementType()); + lstm_ = builder_.create( + fused_func_op_.getLoc(), result_type, input_, input2input_, input2forget_, + input2cell_, input2output_, rec2input_, rec2forget_, rec2cell_, + rec2output_, /*cell_to_input_weights*/ none_, + /*cell_to_forget_weights*/ none_, + /*cell_to_output_weights*/ none_, bias2input_, bias2forget_, bias2cell_, + bias2output_, proj_weight_, proj_bias_, input_activation_state_, + input_cell_state_, input_layer_norm_coefficients_, + forget_layer_norm_coefficients_, cell_layer_norm_coefficients_, + output_layer_norm_coefficients_, builder_.getStringAttr("TANH"), + builder_.getF32FloatAttr(10.0), builder_.getF32FloatAttr(0.0), + builder_.getStringAttr("FULL")); + + builder_.create(fused_func_op_.getLoc(), lstm_.getResult()); +} + +LogicalResult ConvertLSTMCellSimpleToFusedLSTM::Initialize() { + num_gates_ = couple_input_forget_gates_ ? 3 : 4; + + input_ = fused_func_op_.getArgument(0); + bias_ = fused_func_op_.getArgument(2); + + weight_ = fused_func_op_.getArgument(1); + weight_type_ = weight_->getType().cast(); + + if (weight_type_.getRank() != 2) { + return fused_func_op_.emitError() << "The weight tensor was not of rank 2"; + } + + if (weight_type_.getDimSize(1) % num_gates_ != 0) { + return fused_func_op_.emitError() + << "Invalid dimension 1 of weight tensor, " + "should be divisible by the number of gates"; + } + n_cell_ = weight_type_.getDimSize(1) / num_gates_; + + projection_ = fused_func_op_.getArgument(3); + projection_type_ = projection_->getType().cast(); + if (projection_type_.getRank() != 2) { + n_output_ = n_cell_; + } else { + n_output_ = projection_type_.getDimSize(1); + } + n_input_ = weight_type_.getDimSize(0) - n_output_; + num_cols_weight_transposed_ = weight_type_.getDimSize(0); + num_cols_projection_transposed_ = projection_type_.getDimSize(0); + + bias_slice_shape_ = {n_cell_}; + bias_size_values_ = {n_cell_}; + weight_slice_shape_ = {1, num_cols_weight_transposed_}; + weight_slice_size_input_values_ = {n_cell_, n_input_}; + weight_slice_size_recurrent_values_ = {n_cell_, n_output_}; + + return success(); +} + +LogicalResult ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM::Initialize() { + if (failed(ConvertLSTMCellSimpleToFusedLSTM::Initialize())) { + return fused_func_op_.emitError() + << "Specified LayerNormalizedLSTMCellSimple was not of the expected " + "interface and cannot not be converted to the fused LSTM op"; + } + + layer_norm_scale_ = fused_func_op_.getArgument(4); + layer_norm_scale_type_ = + layer_norm_scale_->getType().cast(); + if (layer_norm_scale_type_.getRank() != 1) { + return fused_func_op_.emitError() + << "The layer_norm_scale tensor was not of rank 1"; + } + layer_norm_slice_shape_ = {n_cell_}; + layer_norm_size_values_ = {n_cell_}; + + return success(); +} + +void ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM:: + SetCellLayerNormCoefficients() { + SmallVector begin_cell_layer_norm_values = {0}; + cell_layer_norm_coefficients_ = + SliceRankedTensor(&builder_, layer_norm_scale_, layer_norm_slice_shape_, + begin_cell_layer_norm_values, layer_norm_slice_shape_, + layer_norm_size_values_, fused_func_op_.getLoc()); +} + +void ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM:: + SetInputLayerNormCoefficients() { + SmallVector begin_input_layer_norm_values = {n_cell_}; + input_layer_norm_coefficients_ = + couple_input_forget_gates_ + ? 
none_ + : SliceRankedTensor( + &builder_, layer_norm_scale_, layer_norm_slice_shape_, + begin_input_layer_norm_values, layer_norm_slice_shape_, + layer_norm_size_values_, fused_func_op_.getLoc()); +} + +void ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM:: + SetForgetLayerNormCoefficients() { + SmallVector begin_forget_layer_norm_values = {2 * n_cell_}; + forget_layer_norm_coefficients_ = + SliceRankedTensor(&builder_, layer_norm_scale_, layer_norm_slice_shape_, + begin_forget_layer_norm_values, layer_norm_slice_shape_, + layer_norm_size_values_, fused_func_op_.getLoc()); +} + +void ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM:: + SetOutputLayerNormCoefficients() { + SmallVector begin_output_layer_norm_values = {3 * n_cell_}; + output_layer_norm_coefficients_ = + SliceRankedTensor(&builder_, layer_norm_scale_, layer_norm_slice_shape_, + begin_output_layer_norm_values, layer_norm_slice_shape_, + layer_norm_size_values_, fused_func_op_.getLoc()); +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.h b/tensorflow/compiler/mlir/lite/utils/lstm_utils.h new file mode 100644 index 00000000000..e59b2b662dd --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.h @@ -0,0 +1,214 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LSTM_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LSTM_UTILS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/Location.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +namespace TFL { + +constexpr char kLstmCellSimple[] = "LSTMCellSimple"; +constexpr char kLayerNormalizedLstmCellSimple[] = + "LayerNormalizedLstmCellSimple"; + +// A utility class that enables the conversion of the LSTMCellSimple composite +// op into a fused TFL LSTM op. The fused op is contained within a FuncOp +// that also contains other supporting ops needed to construct the operands for +// the fused op. The caller provides the containing FuncOp as input with +// arguments specifying the input, weight, projection and bias. +// The weight, pprojection, bias and layer norm scale all need to be +// RankedTensorType. +// This class sets the layer norm coefficients to NoneType. +class ConvertLSTMCellSimpleToFusedLSTM { + public: + // TODO(b/140053256): The couple_input_forget_gates should be specified on + // FuncOp as an attribute. 
+ explicit ConvertLSTMCellSimpleToFusedLSTM(mlir::FuncOp fused_func_op, + bool couple_input_forget_gates) + : fused_func_op_(fused_func_op), + couple_input_forget_gates_(couple_input_forget_gates), + builder_(fused_func_op.getBody()) {} + + // not copyable. + ConvertLSTMCellSimpleToFusedLSTM(const ConvertLSTMCellSimpleToFusedLSTM&) = + delete; + ConvertLSTMCellSimpleToFusedLSTM& operator=( + const ConvertLSTMCellSimpleToFusedLSTM&) = delete; + virtual ~ConvertLSTMCellSimpleToFusedLSTM() {} + + // verify input func op arguments and initialize internal state. + virtual LogicalResult Initialize(); + + virtual llvm::StringRef GetCompositeOpName() { return kLstmCellSimple; } + + // Rewrite the func body with constructed fused lstm. + void RewriteFunc(); + + protected: + void UpdateFuncSignature(); + void GenerateFusedOpOperands(); + + void SetWeightForInputToCellGate(); + void SetWeightForInputToInputGate(); + void SetWeightForInputToForgetGate(); + void SetWeightForInputToOutputGate(); + + void SetWeightForRecurrentToCellGate(); + void SetWeightForRecurrentToInputGate(); + void SetWeightForRecurrentToForgetGate(); + void SetWeightForRecurrentToOutputGate(); + + void SetBiasToCellGate(); + void SetBiasToInputGate(); + void SetBiasToForgetGate(); + void SetBiasToOutputGate(); + + void SetProjection(); + void SetProjectionBias(); + + void SetInputActivationState(); + void SetInputCellState(); + + virtual void SetCellLayerNormCoefficients(); + virtual void SetInputLayerNormCoefficients(); + virtual void SetForgetLayerNormCoefficients(); + virtual void SetOutputLayerNormCoefficients(); + + // specified state + FuncOp fused_func_op_; + Value* input_; + Value* weight_; + Value* bias_; + Value* projection_; + bool couple_input_forget_gates_; + + // internal state + Value* weight_transposed_; + Value* projection_transposed_; + RankedTensorType weight_type_; + RankedTensorType projection_type_; + int num_gates_; + int n_cell_; + int n_output_; + int n_input_; + int num_cols_weight_transposed_; + int num_cols_projection_transposed_; + + // input -> cifg + Value* input2input_; + Value* input2forget_; + Value* input2cell_; + Value* input2output_; + + // reccurrent -> cifg + Value* rec2input_; + Value* rec2forget_; + Value* rec2cell_; + Value* rec2output_; + + // bias -> cifg + Value* bias2input_; + Value* bias2forget_; + Value* bias2cell_; + Value* bias2output_; + + // projection + Value* proj_weight_; + Value* proj_bias_; + + // state + Value* input_activation_state_; + Value* input_cell_state_; + + // layer norm coefficients + Value* input_layer_norm_coefficients_; + Value* forget_layer_norm_coefficients_; + Value* cell_layer_norm_coefficients_; + Value* output_layer_norm_coefficients_; + + mlir::TFL::LSTMOp lstm_; + + Value* none_; + SmallVector bias_slice_shape_; + SmallVector bias_size_values_; + SmallVector weight_slice_shape_; + SmallVector weight_slice_size_input_values_; + SmallVector weight_slice_size_recurrent_values_; + OpBuilder builder_; +}; + +// A utility class that enables the conversion of the +// LayerNormalizedLSTMCellSimple composite op into a fused TFL LSTM op. The +// fused op is contained within a FuncOp that also contains other supporting ops +// needed to construct the operands for the fused op. The caller provides the +// containing FuncOp as input with arguments specifying the input, weight, +// projection, bias and layer norm scale. The weight, pprojection, bias and +// layer norm scale all need to be RankedTensorType. 
+// This class overrides the layer norm coefficient setters from the base class. +class ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM + : public ConvertLSTMCellSimpleToFusedLSTM { + public: + // TODO(b/140053256): The couple_input_forget_gates should be specified on + // FuncOp as an attribute. + explicit ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM( + mlir::FuncOp fused_func_op, bool couple_input_forget_gates) + : ConvertLSTMCellSimpleToFusedLSTM(fused_func_op, + couple_input_forget_gates) {} + + // not copyable. + ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM( + const ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM&) = delete; + ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM& operator=( + const ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM&) = delete; + ~ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM() override {} + + llvm::StringRef GetCompositeOpName() override { + return kLayerNormalizedLstmCellSimple; + } + + LogicalResult Initialize() override; + + protected: + void SetCellLayerNormCoefficients() override; + void SetInputLayerNormCoefficients() override; + void SetForgetLayerNormCoefficients() override; + void SetOutputLayerNormCoefficients() override; + + private: + // specified state + Value* layer_norm_scale_; + + // internal state + RankedTensorType layer_norm_scale_type_; + SmallVector layer_norm_slice_shape_; + SmallVector layer_norm_size_values_; +}; + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_LSTM_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc new file mode 100644 index 00000000000..56d6ab1f8ab --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc @@ -0,0 +1,222 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/lite/utils/lstm_utils.h"
+
+#include <memory>
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir
+#include "mlir/IR/Builders.h" // TF:local_config_mlir
+#include "mlir/IR/Function.h" // TF:local_config_mlir
+#include "mlir/IR/Location.h" // TF:local_config_mlir
+#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir
+#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir
+#include "mlir/IR/Types.h" // TF:local_config_mlir
+#include "mlir/IR/Value.h" // TF:local_config_mlir
+#include "mlir/Support/LLVM.h" // TF:local_config_mlir
+#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir
+#include "tensorflow/core/platform/test.h"
+
+namespace mlir {
+namespace TFL {
+
+FuncOp createFusedFunc(mlir::Builder* builder) {
+  SmallVector<int64_t, 2> input_shape{1, 2};
+  SmallVector<int64_t, 2> weight_shape{3, 12};
+  SmallVector<int64_t, 1> bias_shape{2};
+  SmallVector<int64_t, 2> projection_shape{1, 2};
+  SmallVector<int64_t, 1> layer_norm_scale{4};
+  SmallVector<int64_t, 2> output_shape{1, 2};
+  auto input_type = builder->getTensorType(input_shape, builder->getF32Type());
+  auto weight_type =
+      builder->getTensorType(weight_shape, builder->getF32Type());
+  auto bias_type = builder->getTensorType(bias_shape, builder->getF32Type());
+  auto projection_type =
+      builder->getTensorType(projection_shape, builder->getF32Type());
+  auto layer_norm_scale_type =
+      builder->getTensorType(layer_norm_scale, builder->getF32Type());
+  auto output_type =
+      builder->getTensorType(output_shape, builder->getF32Type());
+  SmallVector<mlir::Type, 4> input_types{input_type, weight_type, bias_type,
+                                         projection_type,
+                                         layer_norm_scale_type};
+  auto func_type = builder->getFunctionType(input_types, output_type);
+
+  auto func =
+      FuncOp::create(mlir::NameLoc::get(builder->getIdentifier("fused_func"),
+                                        builder->getContext()),
+                     "fused_func", func_type, {});
+  func.addEntryBlock();
+  return func;
+}
+
+// TODO(ashwinm): Revisit whether this test should be moved to a FileCheck-based
+// test pass once the pass that consumes lstm_utils to stack the layers exists.
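+// A minimal usage sketch of the converter (assuming a composite function laid
+// out like the one createFusedFunc builds above, with arguments input, weight,
+// bias, projection, layer_norm_scale):
+//
+//   mlir::TFL::ConvertLSTMCellSimpleToFusedLSTM convert(
+//       func, /*couple_input_forget_gates=*/false);
+//   if (succeeded(convert.Initialize())) convert.RewriteFunc();
+//
+// The tests below exercise exactly this sequence and then inspect the rewritten
+// function body.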
+class LstmUtilsTest : public ::testing::Test {
+ protected:
+  LstmUtilsTest() {}
+
+  void SetUp() override {
+    builder_ = std::unique_ptr<mlir::Builder>(new Builder(&context_));
+    fused_lstm_func_ = createFusedFunc(builder_.get());
+  }
+
+  void TearDown() override {
+    fused_lstm_func_.erase();
+    builder_.reset();
+  }
+  FuncOp fused_lstm_func_;
+  mlir::MLIRContext context_;
+  std::unique_ptr<mlir::Builder> builder_;
+};
+
+TEST_F(LstmUtilsTest, ConvertLSTMCellSimple) {
+  mlir::TFL::ConvertLSTMCellSimpleToFusedLSTM convert(fused_lstm_func_, false);
+
+  auto result = convert.Initialize();
+  EXPECT_FALSE(failed(result));
+
+  convert.RewriteFunc();
+  fused_lstm_func_.dump();
+
+  // Verify the rewritten function signature and the transposed weight.
+  EXPECT_EQ(
+      fused_lstm_func_.getAttrOfType<StringAttr>("tf._implements").getValue(),
+      convert.GetCompositeOpName());
+  EXPECT_EQ(fused_lstm_func_.getNumArguments(), 5);
+  EXPECT_EQ(fused_lstm_func_.getType().getNumResults(), 1);
+
+  auto transpose_op = fused_lstm_func_.getBody().front().begin();
+  transpose_op++;
+  EXPECT_EQ(transpose_op->getOperand(0)
+                ->getType()
+                .cast<RankedTensorType>()
+                .getDimSize(0),
+            3);
+  EXPECT_EQ(transpose_op->getOperand(0)
+                ->getType()
+                .cast<RankedTensorType>()
+                .getDimSize(1),
+            12);
+  EXPECT_EQ(
+      transpose_op->getResult(0)->getType().cast<RankedTensorType>().getDimSize(
+          0),
+      12);
+  EXPECT_EQ(
+      transpose_op->getResult(0)->getType().cast<RankedTensorType>().getDimSize(
+          1),
+      3);
+
+  auto return_op = fused_lstm_func_.getBody().back().rbegin();
+  EXPECT_EQ(return_op->getName().getStringRef(),
+            mlir::ReturnOp::getOperationName());
+  return_op++;
+  EXPECT_EQ(return_op->getName().getStringRef(),
+            mlir::TFL::LSTMOp::getOperationName());
+  EXPECT_EQ(return_op->getNumOperands(), 24);
+  EXPECT_EQ(return_op->getNumResults(), 1);
+  // cifg = false, so input2input is not None.
+  EXPECT_FALSE(return_op->getOperand(1)->getType().isa<NoneType>());
+  // input layer norm is None
+  EXPECT_TRUE(return_op->getOperand(20)->getType().isa<NoneType>());
+  // proj_bias is F32
+  EXPECT_TRUE(return_op->getOperand(17)
+                  ->getType()
+                  .cast<RankedTensorType>()
+                  .getElementType()
+                  .isF32());
+
+  EXPECT_EQ(fused_lstm_func_.getType().getNumResults(), 1);
+  auto output_types = fused_lstm_func_.getType().getResults();
+  SmallVector<int64_t, 2> output_shape{1, 2};
+  EXPECT_EQ(output_types[0].cast<RankedTensorType>().getShape().size(),
+            output_shape.size());
+  for (int i = 0; i < output_shape.size(); i++) {
+    EXPECT_EQ(output_types[0].cast<RankedTensorType>().getDimSize(i),
+              output_shape[i]);
+  }
+}
+
+TEST_F(LstmUtilsTest, ConvertLSTMCellSimpleToFusedLSTMCoupleInputForget) {
+  mlir::TFL::ConvertLSTMCellSimpleToFusedLSTM convert(fused_lstm_func_, true);
+
+  auto result = convert.Initialize();
+  EXPECT_FALSE(failed(result));
+
+  convert.RewriteFunc();
+  fused_lstm_func_.dump();
+
+  auto it = fused_lstm_func_.getBody().back().rbegin();
+  EXPECT_EQ(it->getName().getStringRef(), mlir::ReturnOp::getOperationName());
+  it++;
+  EXPECT_EQ(it->getName().getStringRef(),
+            mlir::TFL::LSTMOp::getOperationName());
+  EXPECT_EQ(it->getNumOperands(), 24);
+  EXPECT_EQ(it->getNumResults(), 1);
+  // cifg = true, so input2input is None.
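+  // With couple_input_forget_gates = true, Initialize() uses three gates instead
+  // of four, so n_cell_ becomes 12 / 3 = 4 and the input-gate weight, recurrent,
+  // and bias slices are all replaced by the shared none_ value.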
+ EXPECT_TRUE(it->getOperand(1)->getType().isa()); +} + +TEST_F(LstmUtilsTest, ConvertLayerNormLSTMCellSimpleToFusedLSTM) { + mlir::TFL::ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM convert( + fused_lstm_func_, false); + + auto result = convert.Initialize(); + EXPECT_FALSE(failed(result)); + + convert.RewriteFunc(); + fused_lstm_func_.dump(); + + EXPECT_EQ( + fused_lstm_func_.getAttrOfType("tf._implements").getValue(), + convert.GetCompositeOpName()); + EXPECT_EQ(fused_lstm_func_.getNumArguments(), 5); + EXPECT_EQ(fused_lstm_func_.getType().getNumResults(), 1); + + auto it = fused_lstm_func_.getBody().back().rbegin(); + EXPECT_EQ(it->getName().getStringRef(), mlir::ReturnOp::getOperationName()); + it++; + EXPECT_EQ(it->getName().getStringRef(), + mlir::TFL::LSTMOp::getOperationName()); + EXPECT_EQ(it->getNumOperands(), 24); + EXPECT_EQ(it->getNumResults(), 1); + // cifg = false, so input2input is not None. + EXPECT_FALSE(it->getOperand(1)->getType().isa()); + + // input layer norm + EXPECT_FALSE(it->getOperand(20)->getType().isa()); + EXPECT_EQ( + it->getOperand(20)->getType().cast().getShape().size(), + 1); + EXPECT_EQ( + it->getOperand(20)->getType().cast().getDimSize(0), 3); + + EXPECT_EQ(fused_lstm_func_.getType().getNumResults(), 1); + auto output_types = fused_lstm_func_.getType().getResults(); + SmallVector output_shape{1, 2}; + EXPECT_EQ(output_types[0].cast().getShape().size(), + output_shape.size()); + for (int i = 0; i < output_shape.size(); i++) { + EXPECT_EQ(output_types[0].cast().getDimSize(i), + output_shape[i]); + } +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD new file mode 100644 index 00000000000..5291cf3b141 --- /dev/null +++ b/tensorflow/compiler/mlir/python/BUILD @@ -0,0 +1,11 @@ +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) + +exports_files( + ["mlir.i"], + visibility = [ + "//tensorflow/python:__subpackages__", + ], +) diff --git a/tensorflow/compiler/mlir/python/mlir.i b/tensorflow/compiler/mlir/python/mlir.i new file mode 100644 index 00000000000..03273357b2b --- /dev/null +++ b/tensorflow/compiler/mlir/python/mlir.i @@ -0,0 +1,74 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +%include "tensorflow/python/platform/base.i" + +%{ + +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/import_utils.h" + +namespace tensorflow { +namespace swig { + +// Simple wrapper to support tf.mlir.experimental.convert_graph_def. +// Load a .pbptx, convert to MLIR, and (optionally) optimize the module before +// returning it as a string. +// This is an early experimental API, ideally we should return a wrapper object +// around a Python binding to the MLIR module. 
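+// On failure, the TF_Status is populated from the underlying Status and the
+// literal string "// error" is returned in place of the MLIR module.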
+string ImportGraphDef(const string &proto, TF_Status* status) { + GraphDef graphdef; + auto s = tensorflow::LoadProtoFromBuffer(proto, &graphdef); + if (!s.ok()) { + Set_TF_Status_from_Status(status, s); + return "// error"; + } + GraphDebugInfo debug_info; + NodeSpecs specs; + mlir::MLIRContext context; + auto module = ConvertGraphdefToMlir(graphdef, debug_info, specs, &context); + if (!module.ok()) { + Set_TF_Status_from_Status(status, module.status()); + return "// error"; + } + + return MlirModuleToString(*module.ConsumeValueOrDie()); +} + +} // namespace swig +} // namespace tensorflow + +%} + +%ignoreall + +%unignore tensorflow; +%unignore tensorflow::swig; +%unignore tensorflow::swig::ImportGraphDef; + +// Wrap this function +namespace tensorflow { +namespace swig { +static string ImportGraphDef(const string &graphdef, TF_Status* status); +} // namespace swig +} // namespace tensorflow + +%insert("python") %{ +def import_graphdef(graphdef): + return str(ImportGraphDef(str(graphdef).encode('utf-8'))); +%} + +%unignoreall diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 4b64dfcb9dd..b54aef1e42a 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1,5 +1,5 @@ load("@local_config_mlir//:tblgen.bzl", "gentbl") -load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_native_cc_binary") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_gen_op_wrapper_py", "tf_native_cc_binary") package( default_visibility = [":friends"], @@ -123,21 +123,6 @@ cc_library( "ir/tf_ops.cc.inc", "ir/tf_ops.h.inc", "ir/tf_types.cc", - "transforms/bridge.cc", - "transforms/bridge_pass.cc", - "transforms/cluster_formation.cc", - "transforms/cluster_outlining.cc", - "transforms/executor_island_coarsening.cc", - "transforms/functional_control_flow_to_cfg.cc", - "transforms/generated_canonicalize.inc", - "transforms/generated_optimize.inc", - "transforms/graph_pruning.cc", - "transforms/optimize.cc", - "transforms/raise_control_flow.cc", - "transforms/tpu_cluster_formation.cc", - "transforms/tpu_rewrite_pass.cc", - "translate/control_to_executor_dialect.cc", - "translate/executor_to_control_dialect.cc", ], hdrs = [ "ir/control_flow_ops.h", @@ -153,11 +138,11 @@ cc_library( includes = ["include"], deps = [ ":error_util", + ":mlir_passthrough_op", ":tensorflow_canonicalize_inc_gen", ":tensorflow_device_ops_inc_gen", ":tensorflow_executor_inc_gen", ":tensorflow_ops_inc_gen", - ":tensorflow_optimize_inc_gen", "//tensorflow/compiler/mlir/lite:validators", "//tensorflow/core:lib", "@llvm//:support", @@ -175,12 +160,74 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "tensorflow_passes", + srcs = [ + "transforms/bridge.cc", + "transforms/bridge_pass.cc", + "transforms/cluster_formation.cc", + "transforms/cluster_outlining.cc", + "transforms/executor_island_coarsening.cc", + "transforms/fold_switch.cc", + "transforms/functional_control_flow_to_cfg.cc", + "transforms/generated_canonicalize.inc", + "transforms/generated_optimize.inc", + "transforms/graph_pruning.cc", + "transforms/materialize_mlir_passthrough_op.cc", + "transforms/optimize.cc", + "transforms/raise_control_flow.cc", + "transforms/sink_constant.cc", + "transforms/tpu_cluster_formation.cc", + "transforms/tpu_rewrite_pass.cc", + "translate/control_to_executor_dialect.cc", + "translate/executor_to_control_dialect.cc", + ], + hdrs = [ + "transforms/bridge.h", + "transforms/passes.h", + ], + includes = ["include"], + deps = [ + ":error_util", + 
":tensorflow", + ":tensorflow_optimize_inc_gen", + "//tensorflow/compiler/mlir/lite:validators", + "//tensorflow/core:lib", + "//tensorflow/core/platform:logging", + "@llvm//:support", + "@local_config_mlir//:Analysis", + "@local_config_mlir//:IR", + "@local_config_mlir//:Parser", + "@local_config_mlir//:Pass", + "@local_config_mlir//:StandardOps", + "@local_config_mlir//:Support", + "@local_config_mlir//:TransformUtils", + "@local_config_mlir//:Transforms", + ], + # TODO(jpienaar): Merge in the dialect registration. + alwayslink = 1, +) + +cc_library( + name = "tensorflow_test_passes", + srcs = [ + "transforms/lower_tf_pass.cc", + ], + deps = [ + ":lower_tf_lib", + "@local_config_mlir//:IR", + "@local_config_mlir//:Pass", + ], + alwayslink = 1, +) + # Library with TensorFlow dialect static initialization. cc_library( name = "tensorflow_dialect_registration", srcs = ["ir/dialect_registration.cc"], deps = [ ":tensorflow", + ":tensorflow_passes", "@local_config_mlir//:IR", ], alwayslink = 1, @@ -204,6 +251,7 @@ cc_library( ":mangling_util", ":mlir_roundtrip_flags", ":tensorflow", + ":tensorflow_passes", "//tensorflow/cc/saved_model:loader_lite", "//tensorflow/compiler/jit:shape_inference_helpers", "//tensorflow/compiler/xla:status_macros", @@ -252,6 +300,8 @@ cc_library( "utils/export_utils.cc", ], hdrs = [ + "ir/tf_types.def", + "ir/tf_types.h", "utils/export_utils.h", ], deps = [ @@ -639,26 +689,73 @@ gentbl( ) cc_library( - name = "tensorflow_fold_switch", - srcs = [ - "transforms/fold_switch.cc", - ], - hdrs = [ - "transforms/passes.h", - ], - copts = ["-std=c++14"], + name = "compile_mlir_util", + srcs = ["utils/compile_mlir_util.cc"], + hdrs = ["utils/compile_mlir_util.h"], deps = [ - ":tensorflow", + ":convert_type", + ":error_util", + "//tensorflow/compiler/mlir/xla:hlo", + "//tensorflow/compiler/mlir/xla:mlir_hlo_to_hlo", + "//tensorflow/compiler/mlir/xla:type_to_shape", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:framework", - "//tensorflow/core:lib", - "@com_google_absl//absl/memory", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/types:span", "@llvm//:support", - "@local_config_mlir//:Analysis", "@local_config_mlir//:IR", - "@local_config_mlir//:Pass", - "@local_config_mlir//:QuantOps", - "@local_config_mlir//:StandardOps", - "@local_config_mlir//:Support", + "@local_config_mlir//:Parser", + ], +) + +tf_cc_test( + name = "compile_mlir_util_test", + size = "small", + srcs = ["utils/compile_mlir_util_test.cc"], + deps = [ + ":compile_mlir_util", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + ], +) + +cc_library( + name = "mlir_passthrough_op", + srcs = ["ops/mlir_passthrough_op.cc"], + deps = [ + "//tensorflow/core:framework", + ], + alwayslink = 1, +) + +tf_gen_op_wrapper_py( + name = "gen_mlir_passthrough_op_py", + out = "gen_mlir_passthrough_op.py", + deps = [":mlir_passthrough_op"], +) + +# Library to get rewrite patterns lowering within TensorFlow. +# +# This is a separate library so that external passes can link only this library +# without linking any of the other tensorflow passes. 
+cc_library( + name = "lower_tf_lib", + srcs = [ + "transforms/lower_tf.cc", + ], + hdrs = [ + "transforms/lower_tf.h", + ], + deps = [ + ":tensorflow", + "@local_config_mlir//:IR", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 77d412f02c9..72799e19a0d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -40,6 +40,7 @@ limitations under the License. #include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "mlir/Transforms/FoldUtils.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { @@ -48,31 +49,65 @@ namespace { // If the given tensor has elements of type variant, then returns a new type // after dropping subtypes info. Otherwise, returns the original type as is. -Type DropVariantSubTypes(Type ty) { - ShapedType shaped_ty = ty.cast(); - Type element_ty = shaped_ty.getElementType(); +ShapedType DropVariantSubTypes(ShapedType ty) { + Type element_ty = ty.getElementType(); if (!element_ty.isa()) return ty; Type variant_ty = TF::VariantType::get(ty.getContext()); - if (shaped_ty.hasRank()) { - return RankedTensorType::get(shaped_ty.getShape(), variant_ty); + if (ty.hasRank()) { + return RankedTensorType::get(ty.getShape(), variant_ty); } return UnrankedTensorType::get(variant_ty); } +// If the given tensor has elements of type ref, then returns a new type +// of the shape, but corresponding non-ref type as element type. Otherwise, +// returns the original type as is. +ShapedType DropRefType(ShapedType type) { + Type element_ty = type.getElementType(); + TF::TensorFlowRefType ref_type = element_ty.dyn_cast(); + if (!ref_type) return type; + + if (type.hasRank()) { + return RankedTensorType::get(type.getShape(), ref_type.RemoveRef()); + } + return UnrankedTensorType::get(ref_type.RemoveRef()); +} + } // namespace //===----------------------------------------------------------------------===// // TF Executor Dialect //===----------------------------------------------------------------------===// +namespace { + +struct TensorFlowExecutorOpFolderDialectInterface + : public OpFolderDialectInterface { + using OpFolderDialectInterface::OpFolderDialectInterface; + + // Registered hook to check if the given region, which is attached to an + // operation that is *not* isolated from above (i.e. no internal regions + // reference values defined in an enclosing region), should be used when + // materializing constants. + // In the executor dialect we materialize inside an island. + bool shouldMaterializeInto(Region *region) const final { + return isa(region->getParentOp()); + } +}; + +} // namespace + TensorFlowExecutorDialect::TensorFlowExecutorDialect(MLIRContext *context) : Dialect(/*name=*/"tf_executor", context) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc.inc" >(); + + addInterfaces(); + addTypes(); } @@ -296,6 +331,23 @@ void Print(IslandOp op, OpAsmPrinter *p) { p->printOperands(op.getOperands()); *p << ')'; } + + // Check if we can print the short "wraps" form: that is if the island + // contains a single operation and the result of this operation are perfectly + // forwarded to the yield. 
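+  // For example, an island wrapping a single tf.Identity whose result feeds the
+  // yield would print roughly as (exact SSA names depend on the printer):
+  //   %outputs, %control = tf_executor.island wraps "tf.Identity"(%arg0) : (tensor<f32>) -> tensor<f32>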
+ if (op.getAttrs().empty() && + std::next(op.GetBody().begin(), 2) == op.GetBody().end()) { + Operation &wrapped_op = op.GetBody().front(); + Operation &yield_op = op.GetBody().back(); + if (wrapped_op.getNumResults() == yield_op.getNumOperands() && + std::equal(wrapped_op.getResults().begin(), + wrapped_op.getResults().end(), + yield_op.getOperands().begin())) { + *p << " wraps "; + p->printGenericOp(&op.GetBody().front()); + return; + } + } p->printRegion(op.getOperation()->getRegion(0)); p->printOptionalAttrDict(op.getAttrs()); } @@ -316,17 +368,22 @@ ParseResult ParseIslandOp(OpAsmParser *parser, OperationState *result) { // Parse the body region. Region &body = *result->addRegion(); - // TODO(b/134773778): the custom parser is missing support to implement to - // short syntax right now. - // if (!parser->parseOptionalKeyword("wraps")) { - // body.push_back(new Block); - // Block &block = body.back(); - // parser->getBuilder().setInsertionPointToEnd(&block); - // if (parser->parseOperation()) - // return failure(); - // } - - if (parser->parseRegion(body, llvm::None, llvm::None)) return failure(); + if (succeeded(parser->parseOptionalKeyword("wraps"))) { + // If we parse the short version of the island, we have an operation in the + // generic form that follows the "wraps" keyword. Parse it inside the region + // and forward all of its results as-is to the yield operation. + body.push_back(new Block); + Block &block = body.back(); + Operation *wrapped_op = + parser->parseGenericOperation(&block, block.begin()); + if (!wrapped_op) return failure(); + OpBuilder builder(parser->getBuilder().getContext()); + builder.setInsertionPointToEnd(&block); + builder.create(result->location, + llvm::to_vector<8>(wrapped_op->getResults())); + } else if (parser->parseRegion(body, llvm::None, llvm::None)) { + return failure(); + } IslandOp::ensureTerminator(body, parser->getBuilder(), result->location); @@ -536,35 +593,43 @@ LogicalResult Verify(MergeOp merge) { if (data_type.isa()) return merge.emitOpError() << "expects a non-control input"; - // Check that all operands can be broadcasted to a common type compatible with - // the result type. - Type broadcasted_type = merge.output()->getType(); + // Check that each operand can be individually broadcasted to the output type. + Type output_type = merge.output()->getType(); + TensorType output_tensor_ty = output_type.dyn_cast(); + if (!output_tensor_ty) { + return merge.emitOpError() + << "expects output to have tensor type but got " << output_type; + } + bool is_output_ref = + output_tensor_ty.getElementType().isa(); for (Type operand_type : merge.getOperandTypes()) { if (operand_type.isa()) break; // TODO(hinsu): Update ControlOperandsAfterAllData trait to verify this // constraint. - if (!operand_type.isa()) - return merge.emitOpError("expects data operands to have tensor type"); - - // Variant types may have opaque subtypes information that need not match - // between the two types so drop them before computing the broadcasted type. - Type new_broadcasted_type = - OpTrait::util::getBroadcastedType(DropVariantSubTypes(broadcasted_type), - DropVariantSubTypes(operand_type)); - if (!new_broadcasted_type) + TensorType operand_tensor_ty = operand_type.dyn_cast(); + if (!operand_tensor_ty) return merge.emitOpError() - << "expects all operands to be broadcastable" - << " but got " << broadcasted_type << " vs " << operand_type; - // Use the broadcasted type unless we're losing the rank information here. 
- // This is because for example starting with a result of tensor<4xf32>, if - // the first operand is unranked, the broadcasted type will be unranked. - // Then any tensor operand will be broadcastable to this unranked type. - if (!broadcasted_type.cast().hasRank() || - new_broadcasted_type.cast().hasRank()) - broadcasted_type = new_broadcasted_type; - } + << "expects data operands to have tensor type but got " + << operand_type; + // If output type is a ref type then all operand types should also be of the + // same ref type. However, if the output type is a non-ref type T, operands + // can be tensor of type T or T_REF. + if (is_output_ref && + !operand_tensor_ty.getElementType().isa()) { + return merge.emitOpError() + << "expects same operand and output element type but got " + << operand_tensor_ty << " vs " << output_tensor_ty; + } + Type broadcasted_type = OpTrait::util::getBroadcastedType( + DropRefType(DropVariantSubTypes(output_tensor_ty)), + DropRefType(DropVariantSubTypes(operand_tensor_ty))); + if (!broadcasted_type) + return merge.emitOpError() + << "expects all operands to be broadcastable with output type" + << " but got " << operand_tensor_ty << " vs " << output_tensor_ty; + } return success(); } @@ -1088,6 +1153,35 @@ void IslandOp::getCanonicalizationPatterns(OwningRewritePatternList &results, DropEmptyIslandNoOperandOneDataResult>(context); } +//===----------------------------------------------------------------------===// +// tf_executor.ControlTrigger +//===----------------------------------------------------------------------===// + +namespace { +// This pattern matches and removes ControlTriggerOps with no control operands. +// Control result users will have their relevant operands removed. +struct DropEmptyControlTrigger : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(ControlTriggerOp op, + PatternRewriter &rewriter) const override { + if (op.getNumOperands() != 0) return matchFailure(); + + for (auto &use : llvm::make_early_inc_range(op.control()->getUses())) + use.getOwner()->eraseOperand(use.getOperandNumber()); + + rewriter.replaceOp(op, {nullptr}); + + return matchSuccess(); + } +}; +} // anonymous namespace + +void ControlTriggerOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // Folders //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index e8843c7d64f..eb3b9797192 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -594,6 +594,8 @@ def TfExecutor_ControlTriggerOp : TfExecutor_Op<"ControlTrigger", let verifier = ?; + let hasCanonicalizer = 1; + let builders = [OpBuilder< "Builder *builder, OperationState *result, " "ArrayRef operands, ArrayRef attributes = {}", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 153ac5346b9..f01ff57c41d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -88,13 +88,13 @@ Inputs must be of same size and shape. 
}]; let arguments = (ins - Variadic>:$inputs, + Variadic>:$inputs, Confined]>:$N ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Variant]>:$sum + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8, TF_Variant]>:$sum ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -110,12 +110,12 @@ def TF_AddV2Op : TF_Op<"AddV2", [Broadcastable, Commutative, NoSideEffect]>, }]; let arguments = (ins - TF_NumberTensor:$x, - TF_NumberTensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$y ); let results = (outs - TF_NumberTensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -123,6 +123,32 @@ def TF_AddV2Op : TF_Op<"AddV2", [Broadcastable, Commutative, NoSideEffect]>, let hasCanonicalizer = 1; } +def TF_AllOp : TF_Op<"All", [NoSideEffect]> { + let summary = [{ +Computes the "logical and" of elements across dimensions of a tensor. + }]; + + let description = [{ +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`. If `keep_dims` is true, the reduced dimensions are +retained with length 1. + }]; + + let arguments = (ins + I1Tensor:$input, + TF_I32OrI64Tensor:$reduction_indices, + + DefaultValuedAttr:$keep_dims + ); + + let results = (outs + I1Tensor:$output + ); + + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; +} + def TF_AnyOp : TF_Op<"Any", [NoSideEffect]> { let summary = [{ Computes the "logical or" of elements across dimensions of a tensor. @@ -169,7 +195,7 @@ Usage: }]; let arguments = (ins - TF_NumberTensor:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$dimension ); @@ -202,7 +228,7 @@ Usage: }]; let arguments = (ins - TF_NumberTensor:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$dimension ); @@ -261,6 +287,88 @@ window in `value`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_BatchMatMulOp : TF_Op<"BatchMatMul", [NoSideEffect]> { + let summary = "Multiplies slices of two tensors in batches."; + + let description = [{ +Multiplies all slices of `Tensor` `x` and `y` (each slice can be +viewed as an element of a batch), and arranges the individual results +in a single output tensor of the same batch size. Each of the +individual slices can optionally be adjointed (to adjoint a matrix +means to transpose and conjugate it) before multiplication by setting +the `adj_x` or `adj_y` flag to `True`, which are by default `False`. + +The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` +and `[..., r_y, c_y]`. 
+ +The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: + + r_o = c_x if adj_x else r_x + c_o = r_y if adj_y else c_y + +It is computed as: + + output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x, + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y, + + DefaultValuedAttr:$adj_x, + DefaultValuedAttr:$adj_y + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_BatchMatMulV2Op : TF_Op<"BatchMatMulV2", [NoSideEffect]> { + let summary = "Multiplies slices of two tensors in batches."; + + let description = [{ +Multiplies all slices of `Tensor` `x` and `y` (each slice can be +viewed as an element of a batch), and arranges the individual results +in a single output tensor of the same batch size. Each of the +individual slices can optionally be adjointed (to adjoint a matrix +means to transpose and conjugate it) before multiplication by setting +the `adj_x` or `adj_y` flag to `True`, which are by default `False`. + +The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` +and `[..., r_y, c_y]`. + +The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: + + r_o = c_x if adj_x else r_x + c_o = r_y if adj_y else c_y + +It is computed as: + + output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) + +*NOTE*: `BatchMatMulV2` supports broadcasting in the batch dimensions. More +about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x, + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y, + + DefaultValuedAttr:$adj_x, + DefaultValuedAttr:$adj_y + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_BatchToSpaceNDOp : TF_Op<"BatchToSpaceND", [NoSideEffect]> { let summary = "BatchToSpace for N-D tensors of type T."; @@ -297,14 +405,14 @@ Broadcasting is supported, so `value` may have any number of dimensions. }]; let arguments = (ins - TF_NumberTensor:$value, - TF_NumberTensor:$bias, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$value, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$bias, DefaultValuedAttr:$data_format ); let results = (outs - TF_NumberTensor:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -332,21 +440,23 @@ gives module error. For example, Example 1: -```python + >>> a = [1., 2., 3.] ->>> equality_bitcast = tf.bitcast(a,tf.complex128) -tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot bitcast from float to complex128: shape [3] [Op:Bitcast] ->>> equality_cast = tf.cast(a,tf.complex128) +>>> equality_bitcast = tf.bitcast(a, tf.complex128) +Traceback (most recent call last): +... 
+InvalidArgumentError: Cannot bitcast from 1 to 18 [Op:Bitcast] +>>> equality_cast = tf.cast(a, tf.complex128) >>> print(equality_cast) tf.Tensor([1.+0.j 2.+0.j 3.+0.j], shape=(3,), dtype=complex128) -``` + Example 2: -```python + >>> tf.bitcast(tf.constant(0xffffffff, dtype=tf.uint32), tf.uint8) -``` + Example 3: -```python + >>> x = [1., 2., 3.] >>> y = [0., 2., 3.] >>> equality= tf.equal(x,y) @@ -358,10 +468,9 @@ tf.Tensor([False True True], shape=(3,), dtype=bool) tf.Tensor([0. 1. 1.], shape=(3,), dtype=float32) >>> print(equality_bitcast) tf.Tensor( -[[ 0 0 0 0] - [ 0 0 128 63] - [ 0 0 128 63]], shape=(3, 4), dtype=uint8) -``` + [[ 0 0 0 0] + [ 0 0 128 63] + [ 0 0 128 63]], shape=(3, 4), dtype=uint8) *NOTE*: Bitcast is implemented as a low-level cast, so machines with different endian orderings will give different results. @@ -393,14 +502,13 @@ and works its way forward. For example, -```python >>> x = tf.constant([1, 2, 3]) >>> y = tf.broadcast_to(x, [3, 3]) ->>> sess.run(y) -array([[1, 2, 3], - [1, 2, 3], - [1, 2, 3]], dtype=int32) -``` +>>> print(y) +tf.Tensor( + [[1 2 3] + [1 2 3] + [1 2 3]], shape=(3, 3), dtype=int32) In the above example, the input Tensor with the shape of `[1, 3]` is broadcasted to output Tensor with shape of `[3, 3]`. @@ -462,6 +570,27 @@ def TF_CeilOp : TF_Op<"Ceil", [NoSideEffect, SameOperandsAndResultType]> { TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CheckNumericsOp : TF_Op<"CheckNumerics", [SameOperandsAndResultType]> { + let summary = "Checks a tensor for NaN and Inf values."; + + let description = [{ +When run, reports an `InvalidArgument` error if `tensor` has any values +that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. + }]; + + let arguments = (ins + TF_FpTensor:$tensor, + + StrAttr:$message + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ConcatOp : TF_Op<"Concat", [NoSideEffect]> { let summary = "Concatenates tensors along one dimension."; @@ -480,6 +609,10 @@ def TF_ConcatOp : TF_Op<"Concat", [NoSideEffect]> { ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ + return Verify(*this); + }]; } def TF_ConcatV2Op : TF_Op<"ConcatV2", [NoSideEffect]> { @@ -501,6 +634,10 @@ def TF_ConcatV2Op : TF_Op<"ConcatV2", [NoSideEffect]> { TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ + return Verify(*this); + }]; } def TF_ConjOp : TF_Op<"Conj", [NoSideEffect]> { @@ -771,12 +908,12 @@ def TF_DivOp : TF_Op<"Div", [Broadcastable, NoSideEffect]>, }]; let arguments = (ins - TF_NumberTensor:$x, - TF_NumberTensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TF_NumberTensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -805,8 +942,7 @@ See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_EqualOp : TF_Op<"Equal", [Broadcastable, Commutative, NoSideEffect]>, - WithBroadcastableCmpOpBuilder { +def TF_EqualOp : TF_Op<"Equal", [Commutative, NoSideEffect]> { let summary = "Returns the truth 
value of (x == y) element-wise."; let description = [{ @@ -825,8 +961,10 @@ tf.math.equal(x, y) ==> array([True, True]) }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str]>:$x, - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str]>:$y + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint8]>:$y, + + DefaultValuedAttr:$incompatible_shape_error ); let results = (outs @@ -834,6 +972,15 @@ tf.math.equal(x, y) ==> array([True, True]) ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let builders = [ + OpBuilder<"Builder* builder, OperationState* result, Value* x, " + "Value* y, BoolAttr incompatible_shape_error"> + ]; + + let verifier = [{ + return Verify(*this); + }]; } def TF_ExpOp : TF_Op<"Exp", [NoSideEffect, SameOperandsAndResultType]> { @@ -1017,6 +1164,52 @@ values. }]; } +def TF_FakeQuantWithMinMaxVarsPerChannelOp : TF_Op<"FakeQuantWithMinMaxVarsPerChannel", [NoSideEffect]> { + let summary = [{ +Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`, + }]; + + let description = [{ +`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]` +to 'outputs' tensor of same shape as `inputs`. + +`[min; max]` define the clamping range for the `inputs` data. +`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` +when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and +then de-quantized and output as floats in `[min; max]` interval. +`num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. + +Before quantization, `min` and `max` values are adjusted with the following +logic. +It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, +the behavior can be unexpected: +If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. +If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. +If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, +`min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. + +This operation has a gradient and thus allows for training `min` and `max` +values. + }]; + + let arguments = (ins + F32Tensor:$inputs, + F32Tensor:$min, + F32Tensor:$max, + + DefaultValuedAttr:$num_bits, + DefaultValuedAttr:$narrow_range + ); + + let results = (outs + F32Tensor:$outputs + ); + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_FillOp : TF_Op<"Fill", [NoSideEffect]> { let summary = "Creates a tensor filled with a scalar value."; @@ -1082,12 +1275,12 @@ def TF_FloorDivOp : TF_Op<"FloorDiv", [Broadcastable, NoSideEffect]>, }]; let arguments = (ins - TF_NumberTensor:$x, - TF_NumberTensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TF_NumberTensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1780,14 +1973,14 @@ retained with length 1. 
}]; let arguments = (ins - TF_NumberTensor:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TF_NumberTensor:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1801,7 +1994,7 @@ def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect]> { }]; let arguments = (ins - TF_IntOrFpTensor:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint8]>:$input, Confined]>:$ksize, Confined]>:$strides, @@ -1810,7 +2003,7 @@ def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect]> { ); let results = (outs - TF_IntOrFpTensor:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1850,14 +2043,14 @@ retained with length 1. }]; let arguments = (ins - TF_NumberTensor:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TF_NumberTensor:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1931,6 +2124,57 @@ pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2] TF_DerivedOperandTypeAttr Tpaddings = TF_DerivedOperandTypeAttr<1>; } +def TF_MlirPassthroughOpOp : TF_Op<"MlirPassthroughOp", [NoSideEffect]> { + let summary = [{ +Wraps an arbitrary MLIR computation expressed as a module with a main() function. + }]; + + let description = [{ +This operation does not have an associated kernel and is not intended to be +executed in a regular TensorFlow session. Instead it is intended to be used for +testing or for special case where a user intends to pass custom MLIR computation +through a TensorFlow graph with the intent of having custom tooling processing +it downstream (when targeting a different environment, like TensorFlow lite for +example). +The MLIR module is expected to have a main() function that will be used as an +entry point. The inputs to the operations will be passed as argument to the +main() function and the returned values of the main function mapped to the +outputs. 
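The mapping described above (op inputs become `main()` arguments, `main()` results become op outputs) can be spot-checked from Python once the usage example below has been run. The sketch that follows is illustrative only and assumes the example's `graph_def` variable; it simply inspects the raw NodeDef attributes.

```python
# Illustrative sketch, assuming `graph_def` was built as in the usage example
# below: the passthrough node carries the embedded module as a string
# attribute and the operand/result dtypes as derived type lists.
for node in graph_def.node:
    if node.op == "MlirPassthroughOp":
        print(node.attr["mlir_module"].s.decode())    # embedded MLIR text
        print(list(node.attr["Tinputs"].list.type))   # derived operand dtypes
        print(list(node.attr["Toutputs"].list.type))  # derived result dtypes
```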
+Example usage: + +``` +import tensorflow as tf +from tensorflow.compiler.mlir.tensorflow.gen_mlir_passthrough_op import mlir_passthrough_op + +mlir_module = ''' +func @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> { + %add = "magic.op"(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32> + return %add : tensor<10x10xf32> +} +''' + +@tf.function +def foo(x, y): + return mlir_passthrough_op([x, y], mlir_module, Toutputs=[tf.float32]) + +graph_def = foo.get_concrete_function(tf.TensorSpec([10], tf.float32), tf.TensorSpec([10], tf.float32)).graph.as_graph_def() +``` + }]; + + let arguments = (ins + Variadic:$inputs, + + StrAttr:$mlir_module + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + def TF_MulOp : TF_Op<"Mul", [Broadcastable, Commutative, NoSideEffect]>, WithBroadcastableBinOpBuilder { let summary = "Returns x * y element-wise."; @@ -1941,12 +2185,12 @@ def TF_MulOp : TF_Op<"Mul", [Broadcastable, Commutative, NoSideEffect]>, }]; let arguments = (ins - TF_NumberTensor:$x, - TF_NumberTensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TF_NumberTensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2006,8 +2250,101 @@ def TF_NoOp : TF_Op<"NoOp", [NoSideEffect]> { let results = (outs); } -def TF_NotEqualOp : TF_Op<"NotEqual", [Broadcastable, Commutative, NoSideEffect]>, - WithBroadcastableCmpOpBuilder { +def TF_NonMaxSuppressionV4Op : TF_Op<"NonMaxSuppressionV4", [NoSideEffect]> { + let summary = [{ +Greedily selects a subset of bounding boxes in descending order of score, + }]; + + let description = [{ +pruning away boxes that have high intersection-over-union (IOU) overlap +with previously selected boxes. Bounding boxes with score less than +`score_threshold` are removed. Bounding boxes are supplied as +[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any +diagonal pair of box corners and the coordinates can be provided as normalized +(i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm +is agnostic to where the origin is in the coordinate system and more +generally is invariant to orthogonal transformations and translations +of the coordinate system; thus translating or reflections of the coordinate +system result in the same boxes being selected by the algorithm. +The output of this operation is a set of integers indexing into the input +collection of bounding boxes representing the selected boxes. The bounding +box coordinates corresponding to the selected indices can then be obtained +using the `tf.gather operation`.
For example: + selected_indices = tf.image.non_max_suppression_v2( + boxes, scores, max_output_size, iou_threshold, score_threshold) + selected_boxes = tf.gather(boxes, selected_indices) + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$boxes, + TensorOf<[F16, F32]>:$scores, + I32Tensor:$max_output_size, + TensorOf<[F16, F32]>:$iou_threshold, + TensorOf<[F16, F32]>:$score_threshold, + + DefaultValuedAttr:$pad_to_max_output_size + ); + + let results = (outs + I32Tensor:$selected_indices, + I32Tensor:$valid_outputs + ); + + TF_DerivedOperandTypeAttr T_threshold = TF_DerivedOperandTypeAttr<3>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_NonMaxSuppressionV5Op : TF_Op<"NonMaxSuppressionV5", [NoSideEffect]> { + let summary = [{ +Greedily selects a subset of bounding boxes in descending order of score, + }]; + + let description = [{ +pruning away boxes that have high intersection-over-union (IOU) overlap +with previously selected boxes. Bounding boxes with score less than +`score_threshold` are removed. Bounding boxes are supplied as +[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any +diagonal pair of box corners and the coordinates can be provided as normalized +(i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm +is agnostic to where the origin is in the coordinate system and more +generally is invariant to orthogonal transformations and translations +of the coordinate system; thus translating or reflections of the coordinate +system result in the same boxes being selected by the algorithm. +The output of this operation is a set of integers indexing into the input +collection of bounding boxes representing the selected boxes. The bounding +box coordinates corresponding to the selected indices can then be obtained +using the `tf.gather operation`. For example: + selected_indices = tf.image.non_max_suppression_v2( + boxes, scores, max_output_size, iou_threshold, score_threshold) + selected_boxes = tf.gather(boxes, selected_indices) +This op also supports a Soft-NMS (with Gaussian weighting) mode (c.f. +Bodla et al, https://arxiv.org/abs/1704.04503) where boxes reduce the score +of other overlapping boxes instead of directly causing them to be pruned. +To enable this Soft-NMS mode, set the `soft_nms_sigma` parameter to be +larger than 0. 
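On the Python side this kernel surfaces as `tf.image.non_max_suppression_with_scores` in TensorFlow builds that include it (the wrapper name is an assumption about the public API, not something stated in this patch); it returns both the selected indices and the rescored values. A minimal sketch:

```python
import tensorflow as tf

# Three candidate boxes in [y1, x1, y2, x2] form; the first two overlap heavily.
boxes = tf.constant([[0.0, 0.0, 1.0, 1.0],
                     [0.0, 0.1, 1.0, 1.1],
                     [0.0, 0.5, 1.0, 1.5]])
scores = tf.constant([0.9, 0.8, 0.3])

# soft_nms_sigma > 0 enables the Soft-NMS (Gaussian weighting) mode described
# above: overlapping boxes have their scores decayed instead of being dropped.
selected_indices, selected_scores = tf.image.non_max_suppression_with_scores(
    boxes, scores, max_output_size=3, iou_threshold=0.5,
    score_threshold=0.1, soft_nms_sigma=0.5)

selected_boxes = tf.gather(boxes, selected_indices)
```

With `soft_nms_sigma=0.0` the call degenerates to ordinary hard NMS, matching the behaviour of the V4 op above.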
+ }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$boxes, + TensorOf<[F16, F32]>:$scores, + I32Tensor:$max_output_size, + TensorOf<[F16, F32]>:$iou_threshold, + TensorOf<[F16, F32]>:$score_threshold, + TensorOf<[F16, F32]>:$soft_nms_sigma, + + DefaultValuedAttr:$pad_to_max_output_size + ); + + let results = (outs + I32Tensor:$selected_indices, + TensorOf<[F16, F32]>:$selected_scores, + I32Tensor:$valid_outputs + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_NotEqualOp : TF_Op<"NotEqual", [Commutative, NoSideEffect]> { let summary = "Returns the truth value of (x != y) element-wise."; let description = [{ @@ -2016,8 +2353,10 @@ def TF_NotEqualOp : TF_Op<"NotEqual", [Broadcastable, Commutative, NoSideEffect] }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str]>:$x, - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str]>:$y + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint8]>:$y, + + DefaultValuedAttr:$incompatible_shape_error ); let results = (outs @@ -2025,6 +2364,15 @@ def TF_NotEqualOp : TF_Op<"NotEqual", [Broadcastable, Commutative, NoSideEffect] ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let builders = [ + OpBuilder<"Builder* builder, OperationState* result, Value* x, " + "Value* y, BoolAttr incompatible_shape_error"> + ]; + + let verifier = [{ + return Verify(*this); + }]; } def TF_OneHotOp : TF_Op<"OneHot", [NoSideEffect]> { @@ -2121,7 +2469,7 @@ output = }]; let arguments = (ins - TensorOf<[I32, I64, I8]>:$indices, + TensorOf<[I32, I64, TF_Uint8]>:$indices, I32Tensor:$depth, TF_Tensor:$on_value, TF_Tensor:$off_value, @@ -2176,6 +2524,10 @@ This is the opposite of `unpack`. ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let verifier = [{ + return Verify(*this); + }]; } def TF_PadOp : TF_Op<"Pad", [NoSideEffect]> { @@ -2303,14 +2655,14 @@ retained with length 1. }]; let arguments = (ins - TF_NumberTensor:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TF_NumberTensor:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2406,7 +2758,8 @@ The above round function rounds the value based on the given round_mode. DefaultValuedAttr:$num_bits, DefaultValuedAttr:$range_given, DefaultValuedAttr, "HALF_TO_EVEN">:$round_mode, - DefaultValuedAttr:$narrow_range + DefaultValuedAttr:$narrow_range, + DefaultValuedAttr:$axis ); let results = (outs @@ -2432,7 +2785,8 @@ tensor, so its value can change during training. DefaultValuedAttr:$signed_input, DefaultValuedAttr:$range_given, - DefaultValuedAttr:$narrow_range + DefaultValuedAttr:$narrow_range, + DefaultValuedAttr:$axis ); let results = (outs @@ -2550,12 +2904,12 @@ If `x` and `y` are reals, this will return the floating-point division. 
}]; let arguments = (ins - TF_NumberTensor:$x, - TF_NumberTensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TF_NumberTensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2590,11 +2944,11 @@ def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType]> { }]; let arguments = (ins - TF_IntOrFpTensor:$features + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$features ); let results = (outs - TF_IntOrFpTensor:$activations + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$activations ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2709,7 +3063,7 @@ Input images can be of different types but output images are always float. }]; let arguments = (ins - TF_IntOrFpTensor:$images, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$images, I32Tensor:$size, DefaultValuedAttr:$align_corners, @@ -2732,7 +3086,7 @@ Resize `images` to `size` using nearest neighbor interpolation. }]; let arguments = (ins - TensorOf<[F16, F32, F64, I16, I32, I64, I8]>:$images, + TensorOf<[F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$images, I32Tensor:$size, DefaultValuedAttr:$align_corners, @@ -2740,7 +3094,7 @@ Resize `images` to `size` using nearest neighbor interpolation. ); let results = (outs - TensorOf<[F16, F32, F64, I16, I32, I64, I8]>:$resized_images + TensorOf<[F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$resized_images ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2875,12 +3229,12 @@ reverse(t, dims) ==> [[[[8, 9, 10, 11], }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str]>:$tensor, + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str, TF_Uint16, TF_Uint8]>:$tensor, TF_I32OrI64Tensor:$axis ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str]>:$output + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str, TF_Uint16, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3031,6 +3385,10 @@ shape(t) ==> [2, 2, 3] return Verify(*this); }]; + let builders = [ + OpBuilder<"Builder* builder, OperationState* result, Value* input, BoolAttr use32Bit"> + ]; + let hasFolder = 1; } @@ -3643,12 +4001,12 @@ def TF_SubOp : TF_Op<"Sub", [Broadcastable, NoSideEffect]>, }]; let arguments = (ins - TF_NumberTensor:$x, - TF_NumberTensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TF_NumberTensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3667,20 +4025,36 @@ retained with length 1. 
}]; let arguments = (ins - TF_NumberTensor:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TF_NumberTensor:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; } +def TF_TPUCompilationResultOp : TF_Op<"TPUCompilationResult", [NoSideEffect]> { + let summary = "Returns the result of a TPU compilation."; + + let description = [{ +This operation returns the result of a TPU compilation as a serialized +CompilationResultProto, which holds a status and an error message if an error +occurred during compilation. + }]; + + let arguments = (ins); + + let results = (outs + TF_StrTensor:$output + ); +} + def TF_TanhOp : TF_Op<"Tanh", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes hyperbolic tangent of `x` element-wise."; @@ -3750,6 +4124,23 @@ def TF_TensorListGetItemOp : TF_Op<"TensorListGetItem", [NoSideEffect]> { TF_DerivedResultTypeAttr element_dtype = TF_DerivedResultTypeAttr<0>; } +def TF_TensorListLengthOp : TF_Op<"TensorListLength", [NoSideEffect]> { + let summary = "Returns the number of tensors in the input tensor list."; + + let description = [{ +input_handle: the input list +length: the number of tensors in the list + }]; + + let arguments = (ins + TF_VariantTensor:$input_handle + ); + + let results = (outs + I32Tensor:$length + ); +} + def TF_TensorListPushBackOp : TF_Op<"TensorListPushBack", [NoSideEffect]> { let summary = [{ Returns a list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`. @@ -3921,12 +4312,12 @@ Python Semantics. 
}]; let arguments = (ins - TF_NumberTensor:$x, - TF_NumberTensor:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TF_NumberTensor:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4068,7 +4459,7 @@ where(input) ==> [[0, 0, 0], }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$input + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input ); let results = (outs diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 080e78042a7..2a3f984d3d1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -178,7 +178,8 @@ def TF_AnyNumber : AnyTypeOf<[TF_Int, AnyFloat, TF_AnyQuantized, TF_AnyComplex], def TF_NumberTensor : TensorOf<[TF_AnyNumber]>; -def TF_NumberOrStrTensor : TensorOf<[TF_AnyNumber, TF_Str]>; +def TF_NumberOrStr : AnyTypeOf<[AnyFloat, TF_SInt, TF_AnyComplex, TF_Uint8, TF_Str]>; +def TF_NumberOrStrTensor : TensorOf<[TF_NumberOrStr]>; //===----------------------------------------------------------------------===// // TensorFlow attribute definitions diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 39e3bf08553..8d28ec26507 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -19,13 +19,16 @@ limitations under the License. #include #include #include +#include #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/Dialect/Traits.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Diagnostics.h" // TF:local_config_mlir @@ -34,6 +37,7 @@ limitations under the License. #include "mlir/IR/Matchers.h" // TF:local_config_mlir #include "mlir/IR/OpImplementation.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir @@ -71,10 +75,19 @@ static inline bool IsOfRankOrUnranked(Value *value, int64_t rank) { // Returns true if the given `value` has at least the specified rank or has // unranked type. static inline bool HasRankAtLeast(Value *value, int64_t rank) { - auto type = value->getType(); + Type type = value->getType(); if (auto ranked_type = type.dyn_cast()) return ranked_type.getRank() >= rank; - return type.isa(); + return true; +} + +// Returns true if the given `value` has at most the specified rank or has +// unranked type. 
+static inline bool HasRankAtMost(Value *value, int64_t rank) { + Type type = value->getType(); + if (auto ranked_type = type.dyn_cast()) + return ranked_type.getRank() <= rank; + return true; } // Returns true if the given pair of TensorFlow types can be cast to one @@ -95,6 +108,85 @@ static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; } +// Returns the tf.Equal/tf.NotEqual result type given `x` and `y` and inputs. If +// `incompatible_shape_error` is true, reports error if `x` and `y` has +// incompatible shapes. Otherwise, returns a tensor type with unknown rank. +static Type DeduceEqualCmpOpType(Builder *builder, Location loc, Value *x, + Value *y, BoolAttr incompatible_shape_error) { + auto result_type = + OpTrait::util::getBroadcastedType(x->getType(), y->getType()); + if (!result_type) { + if (incompatible_shape_error.getValue()) { + mlir::emitError(loc, "non-broadcastable operands"); + } else { + result_type = builder->getTensorType(builder->getI1Type()); + } + } + return result_type; +} + +// Verifies that the given types are cast compatible. If not, emits appropriate +// error for the given op. If mask_one_dim is set to true, then the types are +// allowed to have one mismatching dimension. Masking one of the dimensions is +// useful for ops like Concat that requires all ranked inputs to have the same +// rank and match dimension sizes for all but one of the dimensions. +static LogicalResult VerifyTypesCompatibility( + Operation::operand_type_range types, bool mask_one_dim, Operation *op) { + constexpr int64_t kUninitialized = -1; + int64_t common_rank = kUninitialized; + llvm::SmallVector common_dims; + int64_t dim_to_mask = kUninitialized; + + // Initialize common_rank with rank of the first ranked type and verify that + // following ranked types have the same rank. + // Similarly, initialize each of the dimensions with the first type that has + // the dimension size available and verify that all following types have the + // same size for the dimension. However, if mask_one_dim is true, note down + // the dimension index on the first mismatch and ignore dimension at that + // index in following types. + for (Type ty : types) { + RankedTensorType ranked_ty = ty.dyn_cast(); + if (!ranked_ty) continue; + + int64_t rank = ranked_ty.getRank(); + if (common_rank == kUninitialized) { + common_rank = rank; + common_dims.resize(common_rank, kUninitialized); + } else if (common_rank != rank) { + return op->emitError() + << "operand type " << ranked_ty + << " is not compatible with preceding operands; expected rank: " + << common_rank; + } + + for (int64_t i = 0, e = common_rank; i != e; i++) { + if (i == dim_to_mask) continue; + + int64_t dim = ranked_ty.getDimSize(i); + if (dim == kUninitialized) continue; + + int64_t &common_dim = common_dims[i]; + if (common_dim == kUninitialized) { + common_dim = dim; + } else if (common_dim != dim) { + // If mask_one_dim is true, do not emit an error if this is the only + // dimension with mismatches. Note down the dimension to mask it from + // the following types. 
+ if (mask_one_dim && dim_to_mask == kUninitialized) { + dim_to_mask = i; + continue; + } + + return op->emitError() << "operand type " << ranked_ty + << " is not compatible with preceding operands; " + "expected dimension at index " + << i << ": " << common_dim; + } + } + } + return success(); +} + namespace { #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" } // namespace @@ -176,6 +268,36 @@ void CastOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +//===----------------------------------------------------------------------===// +// ConcatOp and ConcatV2Op +//===----------------------------------------------------------------------===// + +template ::value>> +static LogicalResult Verify(OpT op) { + // TODO(hinsu): Convert variadic length attributes to derived attributes. + Operation::operand_range values = op.values(); + + auto num_values = std::distance(values.begin(), values.end()); + int64_t attr_N = op.N().getLimitedValue(); + if (num_values != attr_N) { + return op.emitOpError() + << "requires attribute 'N' to match the number of inputs; expected: " + << num_values << " Found: " << attr_N; + } + + int axis_idx = std::is_same() ? 0 : 1; + Value *axis = *op.getODSOperands(axis_idx).begin(); + if (!HasRankAtMost(axis, 1)) { + return op.emitOpError( + "requires axis to be of scalar type (or vector type for older " + "versions)"); + } + + return VerifyTypesCompatibility(values, + /*mask_one_dim=*/true, op.getOperation()); +} + //===----------------------------------------------------------------------===// // ConjOp //===----------------------------------------------------------------------===// @@ -257,6 +379,26 @@ static LogicalResult Verify(EmptyTensorListOp op) { return success(); } +//===----------------------------------------------------------------------===// +// EqualOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(EqualOp op) { + // If we allow inputs to have incompatible type, then nothing to do. + if (!op.incompatible_shape_error()) return success(); + + // Otherwise, check inputs are broadcastable. + return mlir::OpTrait::impl::verifyCompatibleOperandBroadcast( + op.getOperation()); +} + +void EqualOp::build(Builder *builder, OperationState *result, Value *x, + Value *y, BoolAttr incompatible_shape_error) { + auto result_type = DeduceEqualCmpOpType(builder, result->location, x, y, + incompatible_shape_error); + return build(builder, result, result_type, x, y, incompatible_shape_error); +} + //===----------------------------------------------------------------------===// // FakeQuantWithMinMaxArgsOp //===----------------------------------------------------------------------===// @@ -276,12 +418,6 @@ static LogicalResult Verify(FakeQuantWithMinMaxArgsOp op) { return op.emitOpError("range is invalid: [" + Twine(std::to_string(rmin)) + "," + Twine(std::to_string(rmax)) + "]"); } - // Range must straddle zero. 
- if (rmin > 0.0 || rmax < 0.0) { - return op.emitOpError("range failed to straddle zero: [" + - Twine(std::to_string(rmin)) + "," + - Twine(std::to_string(rmax)) + "]"); - } int64_t num_bits = op.num_bits().getSExtValue(); if (num_bits < 2 || num_bits > 16) { return op.emitOpError( @@ -308,6 +444,37 @@ static LogicalResult Verify(FakeQuantWithMinMaxVarsOp op) { return success(); } +//===----------------------------------------------------------------------===// +// FakeQuantWithMinMaxVarsPerChannelOp +//===----------------------------------------------------------------------===// +static LogicalResult Verify(FakeQuantWithMinMaxVarsPerChannelOp op) { + if (!isOfRankedFloatTensorType(op.min(), 1)) + return op.emitOpError("requires min to be a 1d float tensor"); + + if (!isOfRankedFloatTensorType(op.max(), 1)) + return op.emitOpError("requires max to be a 1d float tensor"); + + Value *inputs = op.inputs(); + if (!HasRankAtLeast(inputs, 1) || + inputs->getType().isa()) { + return op.emitError("requires inputs to be at least 1d float tensor"); + } + + auto inputsType = inputs->getType().cast(); + int depth = inputsType.getDimSize(inputsType.getRank() - 1); + if (op.min()->getType().cast().getDimSize(0) != depth || + op.max()->getType().cast().getDimSize(0) != depth) { + return op.emitOpError( + "requires min and max to have same size as last dimension of inputs"); + } + int64_t num_bits = op.num_bits().getSExtValue(); + if (num_bits < 2 || num_bits > 16) { + return op.emitOpError( + "requires num_bits to be between 2 and 16, inclusive"); + } + return success(); +} + //===----------------------------------------------------------------------===// // FusedBatchNormOp //===----------------------------------------------------------------------===// @@ -471,6 +638,74 @@ void NegOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +//===----------------------------------------------------------------------===// +// NotEqualOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(NotEqualOp op) { + // If we allow inputs to have incompatible type, then nothing to do. + if (!op.incompatible_shape_error()) return success(); + + // Otherwise, check inputs are broadcastable. + return mlir::OpTrait::impl::verifyCompatibleOperandBroadcast( + op.getOperation()); +} + +void NotEqualOp::build(Builder *builder, OperationState *result, Value *x, + Value *y, BoolAttr incompatible_shape_error) { + auto result_type = DeduceEqualCmpOpType(builder, result->location, x, y, + incompatible_shape_error); + return build(builder, result, result_type, x, y, incompatible_shape_error); +} + +//===----------------------------------------------------------------------===// +// PackOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(PackOp op) { + // TODO(hinsu): Convert variadic length attributes to derived attributes. 
+ Operation::operand_range values = op.values(); + + auto num_values = std::distance(values.begin(), values.end()); + int64_t attr_N = op.N().getLimitedValue(); + if (num_values != attr_N) { + return op.emitOpError() + << "requires attribute 'N' to match the number of inputs; expected: " + << num_values << " Found: " << attr_N; + } + + if (failed(VerifyTypesCompatibility(values, + /*mask_one_dim=*/false, + op.getOperation()))) { + return failure(); + } + + int64_t inputs_rank = -1; + for (Value *value : values) { + if (auto ty = value->getType().dyn_cast()) { + // Exit early as input types are verified to be compatible so all ranked + // tensors have the same rank. + inputs_rank = ty.getRank(); + break; + } + } + if (inputs_rank == -1) return success(); + + // The values can be packed along any of the dimensions between 0 and + // inputs rank, inclusive. Also, as the negative axis values wrap around so + // the axis value range is [-(R+1), R+1). + int64_t range_begin = -inputs_rank - 1; // Inclusive + int64_t range_end = inputs_rank + 1; // Exclusive + int64_t axis = op.axis().getLimitedValue(); + if (axis < range_begin || axis >= range_end) { + return op.emitError() << "attribute 'axis' should be within range [" + << range_begin << ", " << range_end + << "); actual value: " << axis; + } + + return success(); +} + //===----------------------------------------------------------------------===// // ReciprocalOp //===----------------------------------------------------------------------===// @@ -731,6 +966,16 @@ OpFoldResult ShapeOp::fold(ArrayRef operands) { return b.getDenseElementsAttr(resultType, dimensions); } +void ShapeOp::build(Builder *builder, OperationState *result, Value *input, + BoolAttr use32Bit) { + auto rankedTensorType = input->getType().dyn_cast(); + int64_t rank = rankedTensorType ? rankedTensorType.getRank() : -1; + auto out_type = use32Bit.getValue() ? builder->getIntegerType(32) + : builder->getIntegerType(64); + return ShapeOp::build(builder, result, + builder->getTensorType({rank}, out_type), input); +} + //===----------------------------------------------------------------------===// // ShapeNOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ops/mlir_passthrough_op.cc b/tensorflow/compiler/mlir/tensorflow/ops/mlir_passthrough_op.cc new file mode 100644 index 00000000000..fe9bfcccba7 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ops/mlir_passthrough_op.cc @@ -0,0 +1,60 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { + +REGISTER_OP("MlirPassthroughOp") + .Attr("mlir_module: string") + .Attr("Tinputs : list(type) >= 0") + .Input("inputs: Tinputs") + .Attr("Toutputs : list(type) >= 0") + .Output("outputs: Toutputs") + .Doc(R"doc( +Wraps an arbitrary MLIR computation expressed as a module with a main() function. + +This operation does not have an associated kernel and is not intended to be +executed in a regular TensorFlow session. Instead it is intended to be used for +testing or for special case where a user intends to pass custom MLIR computation +through a TensorFlow graph with the intent of having custom tooling processing +it downstream (when targeting a different environment, like TensorFlow lite for +example). +The MLIR module is expected to have a main() function that will be used as an +entry point. The inputs to the operations will be passed as argument to the +main() function and the returned values of the main function mapped to the +outputs. +Example usage: + +``` +import tensorflow as tf +from tensorflow.compiler.mlir.tensorflow.gen_mlir_passthrough_op import mlir_passthrough_op + +mlir_module = ''' +func @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> { + %add = "magic.op"(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32> + return %ret : tensor<10x10xf32> +} +''' + +@tf.function +def foo(x, y): + return = mlir_passthrough_op([x, y], mlir_module, Toutputs=[tf.float32]) + +graph_def = foo.get_concrete_function(tf.TensorSpec([10], tf.float32), tf.TensorSpec([10], tf.float32)).graph.as_graph_def() +``` +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 65feaa8b84c..a15d543825d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -236,8 +236,8 @@ func @testLogicalNotOfEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> %1 = "tf.LogicalNot"(%0) : (tensor<8x16xi1>) -> tensor<8x16xi1> return %1: tensor<8x16xi1> -// CHECK: %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1> -// CHECK: return %0 +// CHECK: %[[NE:.*]] = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = true} +// CHECK: return %[[NE]] } // CHECK-LABEL: testLogicalNotOfNotEqual @@ -246,8 +246,8 @@ func @testLogicalNotOfNotEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) %1 = "tf.LogicalNot"(%0) : (tensor<8x16xi1>) -> tensor<8x16xi1> return %1: tensor<8x16xi1> -// CHECK: %0 = "tf.Equal"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1> -// CHECK: return %0 +// CHECK: %[[NE:.*]] = "tf.Equal"(%arg0, %arg1) {incompatible_shape_error = true} +// CHECK: return %[[NE]] } // CHECK-LABEL: testLogicalNotOfGreater diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir index 9e2fdcc1ee5..caf6e73b98f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -split-input-file -tf-device-cluster-formation | FileCheck %s +// RUN: tf-opt %s -split-input-file -tf-device-cluster-formation | FileCheck %s -dump-input-on-failure // Simple case, single 
device cluster. @@ -72,11 +72,8 @@ module { // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) func @argliveinotherislands(%arg0: tensor) -> tensor { %0 = tf_executor.graph { - // CHECK: %[[OTHER_ISLAND_OUTPUT:[0-9]*]]:2 = tf_executor.island { - %1:2 = tf_executor.island { - %3 = "tf.D"(%arg0) : (tensor) -> tensor - tf_executor.yield %3 : tensor - } + // CHECK: %[[OTHER_ISLAND_OUTPUT:[0-9]*]]:2 = tf_executor.island wraps "tf.D" + %1:2 = tf_executor.island wraps "tf.D"(%arg0) : (tensor) -> tensor %2:2 = tf_executor.island { // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir index 30272b443a1..1d0e2b245bf 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir @@ -90,16 +90,13 @@ module { // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) func @multiplelaunches(%arg0: tensor) -> tensor { %0 = tf_executor.graph { - %1:2 = tf_executor.island { - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf_device.launch_func"() {device = "tpu0", func = @tpu0_func} - %2 = "tf_device.launch"() ( { + %1:2 = tf_executor.island wraps + // CHECK: %[[A_OUTPUT:[0-9]*]]:2 = {{.*}} "tf_device.launch_func"() {device = "tpu0", func = @tpu0_func} + "tf_device.launch"() ( { %3 = "tf.A"() : () -> tensor "tf_device.return"(%3) : (tensor) -> () }) {device = "tpu0"} : () -> tensor - - // CHECK: tf_executor.yield %[[A_OUTPUT]] - tf_executor.yield %2 : tensor - } + // CHECK: tf_executor.fetch %[[A_OUTPUT]]#0 tf_executor.fetch %1#0 : tensor } return %0 : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir b/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir index 48f4c8f77df..25adff97d48 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir @@ -11,14 +11,8 @@ func @islands_with_control(tensor<*xf32>) -> tensor<*xf32> { } // CHECK-NEXT: %[[GRAPH:[0-9]*]] = tf_executor.graph { -// CHECK-NEXT: %[[IDENTITY:[0-9]*]]:2 = tf_executor.island { -// CHECK-NEXT: %{{[0-9]*}} = "tf.Identity"(%[[ARG0]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: tf_executor.yield %{{[0-9]*}} : tensor<*xf32> -// CHECK-NEXT: } -// CHECK-NEXT: %[[ADD:[0-9]*]]:2 = tf_executor.island(%[[IDENTITY]]#1) { -// CHECK-NEXT: %{{[0-9]*}} = "tf.Add"(%[[ARG0]], %[[ARG0]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: tf_executor.yield %{{[0-9]*}} : tensor<*xf32> -// CHECK-NEXT: } +// CHECK-NEXT: %[[IDENTITY:[0-9]*]]:2 = tf_executor.island wraps "tf.Identity"(%[[ARG0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[ADD:[0-9]*]]:2 = tf_executor.island(%[[IDENTITY]]#1) wraps "tf.Add"(%[[ARG0]], %[[ARG0]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK-NEXT: tf_executor.fetch %[[ADD]]#0 : tensor<*xf32> // CHECK-NEXT: } // CHECK-NEXT: return %[[GRAPH]] : tensor<*xf32> @@ -45,40 +39,19 @@ func @LoopTest() { } // CHECK-NEXT: tf_executor.graph { -// CHECK-NEXT: %[[CONST:[0-9]*]]:2 = tf_executor.island { -// CHECK-NEXT: %{{[a-z0-9]*}} = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<1> : tensor} : () -> tensor -// CHECK-NEXT: tf_executor.yield %{{[a-z0-9]*}} : tensor -// CHECK-NEXT: } +// CHECK-NEXT: %[[CONST:[0-9]*]]:2 = tf_executor.island wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", 
name = "Const", value = dense<1> : tensor} : () -> tensor // CHECK-NEXT: %[[ENTER:[0-9]*]]:2 = tf_executor.Enter %[[CONST]]#0 frame "while/while_context" : (tensor) -> (tensor<*xi32>, !tf_executor.control) {T = "tfdtype$DT_INT32", device = "", name = "while/Enter"} -// CHECK-NEXT: %[[NOOP:[0-9]*]] = tf_executor.island { -// CHECK-NEXT: "tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> () -// CHECK-NEXT: tf_executor.yield -// CHECK-NEXT: } +// CHECK-NEXT: %[[NOOP:[0-9]*]] = tf_executor.island wraps "tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> () // CHECK-NEXT: %[[NEXTIT_SRC:[0-9]*]]:3 = tf_executor.NextIteration.Source : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} // CHECK-NEXT: %[[MERGE:[0-9]*]]:3 = tf_executor.Merge %[[NEXTIT_SRC]]#0, %[[ENTER]]#0 : tensor<*xi32> {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "while/Merge"} -// CHECK-NEXT: %[[CONST_LESS:[0-9]*]]:2 = tf_executor.island(%[[MERGE]]#2) { -// CHECK-NEXT: %{{[a-z0-9]*}} = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/Less/y", value = dense<2> : tensor} : () -> tensor -// CHECK-NEXT: tf_executor.yield %{{[a-z0-9]*}} : tensor -// CHECK-NEXT: } -// CHECK-NEXT: %[[LESS:[0-9]*]]:2 = tf_executor.island { -// CHECK-NEXT: %{{[a-z0-9]*}} = "tf.Less"(%[[MERGE]]#0, %[[CONST_LESS]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Less"} : (tensor<*xi32>, tensor) -> tensor<*xi1> -// CHECK-NEXT: tf_executor.yield %{{[a-z0-9]*}} : tensor<*xi1> -// CHECK-NEXT: } +// CHECK-NEXT: %[[CONST_LESS:[0-9]*]]:2 = tf_executor.island(%[[MERGE]]#2) wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/Less/y", value = dense<2> : tensor} : () -> tensor +// CHECK-NEXT: %[[LESS:[0-9]*]]:2 = tf_executor.island wraps "tf.Less"(%[[MERGE]]#0, %[[CONST_LESS]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Less"} : (tensor<*xi32>, tensor) -> tensor<*xi1> // CHECK-NEXT: %[[COND:[0-9]*]]:2 = tf_executor.LoopCond %[[LESS:[0-9]*]]#0 : (tensor<*xi1>) -> (tensor, !tf_executor.control) {device = "", name = "while/LoopCond"} // CHECK-NEXT: %[[SWITCH:[0-9]*]]:3 = tf_executor.Switch %[[MERGE]]#0, %[[COND]]#0 : tensor<*xi32> {T = "tfdtype$DT_INT32", _class = ["loc = @while/Merge"], device = "", name = "while/Switch"} // CHECK-NEXT: %[[EXIT:[0-9]*]]:2 = tf_executor.Exit %[[SWITCH]]#0 : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", name = "while/Exit"} -// CHECK-NEXT: %[[IDENTITY:[0-9]*]]:2 = tf_executor.island { -// CHECK-NEXT: %{{[a-z0-9]*}} = "tf.Identity"(%[[SWITCH]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Identity"} : (tensor<*xi32>) -> tensor<*xi32> -// CHECK-NEXT: tf_executor.yield %{{[a-z0-9]*}} : tensor<*xi32> -// CHECK-NEXT: } -// CHECK-NEXT: %[[CONST_ADD:[0-9]*]]:2 = tf_executor.island(%[[IDENTITY]]#1) { -// CHECK-NEXT: %{{[a-z0-9]*}} = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/Add/y", value = dense<3> : tensor} : () -> tensor -// CHECK-NEXT: tf_executor.yield %{{[a-z0-9]*}} : tensor -// CHECK-NEXT: } -// CHECK-NEXT: %[[ADD:[0-9]*]]:2 = tf_executor.island { -// CHECK-NEXT: %{{[0-9]*}} = "tf.Add"(%[[IDENTITY]]#0, %[[CONST_ADD]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> tensor<*xi32> -// CHECK-NEXT: tf_executor.yield %{{[0-9]*}} : tensor<*xi32> -// CHECK-NEXT: } +// CHECK-NEXT: %[[IDENTITY:[0-9]*]]:2 = tf_executor.island wraps "tf.Identity"(%[[SWITCH]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Identity"} : (tensor<*xi32>) 
-> tensor<*xi32> +// CHECK-NEXT: %[[CONST_ADD:[0-9]*]]:2 = tf_executor.island(%[[IDENTITY]]#1) wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/Add/y", value = dense<3> : tensor} : () -> tensor +// CHECK-NEXT: %[[ADD:[0-9]*]]:2 = tf_executor.island wraps "tf.Add"(%[[IDENTITY]]#0, %[[CONST_ADD]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> tensor<*xi32> // CHECK-NEXT: %[[CT:[0-9]*]] = tf_executor.ControlTrigger %[[NOOP]], %[[ADD]]#1, %[[EXIT]]#1 {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} // CHECK-NEXT: tf_executor.NextIteration.Sink [%[[NEXTIT_SRC]]#1] %[[ADD]]#0, %[[CT]] : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} // CHECK-NEXT: tf_executor.fetch diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_canonicalize.mlir index 5b4e8e16cbb..19ce07db947 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_canonicalize.mlir @@ -285,9 +285,9 @@ func @empty_island_no_operand_no_data_result() { return } -// CHECK: %[[ISLAND_0:[0-9]*]] = tf_executor.island { +// CHECK: %[[ISLAND_0:[0-9]*]] = tf_executor.island // CHECK-NEXT: "tf.opA" -// CHECK: tf_executor.island(%[[ISLAND_0]]) { +// CHECK: tf_executor.island(%[[ISLAND_0]]) // CHECK-NEXT: "tf.opB" // CHECK-NOT: tf_executor.island @@ -313,9 +313,9 @@ func @empty_island_one_operand_no_data_result() { return } -// CHECK: %[[ISLAND_1:[0-9]*]] = tf_executor.island { +// CHECK: %[[ISLAND_1:[0-9]*]] = tf_executor.island // CHECK-NEXT: "tf.opA" -// CHECK: tf_executor.island(%[[ISLAND_1]]) { +// CHECK: tf_executor.island(%[[ISLAND_1]]) // CHECK-NEXT: "tf.opB" // CHECK-NOT: tf_executor.island @@ -342,8 +342,34 @@ func @empty_island_no_operand_one_data_no_control_result(%arg0 : tensor) { return } -// CHECK: tf_executor.island { +// CHECK: tf_executor.island // CHECK-NEXT: "tf.opA"(%[[ARG_0]]) // CHECK: tf_executor.island { // CHECK-NEXT: "tf.opB"(%[[ARG_0]]) // CHECK-NOT: tf_executor.island + + +// Test empty control trigger with no operands is removed. +// Control result users should also have their respective operands removed. 
+// CHECK-LABEL: func @empty_control_trigger +func @empty_control_trigger() { + tf_executor.graph { + %0 = tf_executor.ControlTrigger {} + %1 = tf_executor.island(%0) { + %3 = "tf.opA"() : () -> tensor + tf_executor.yield + } + %2 = tf_executor.island(%0, %1) { + %4 = "tf.opB"() : () -> tensor + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: %[[ISLAND_0:[0-9]*]] = tf_executor.island +// CHECK-NEXT: "tf.opA" +// CHECK: tf_executor.island(%[[ISLAND_0]]) +// CHECK-NEXT: "tf.opB" +// CHECK-NOT: tf_executor.island diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir index a9e83dd006c..35dc4caba90 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir @@ -89,9 +89,7 @@ func @empty_islands(%arg0 : tensor, %arg1 : tensor) -> (tensor, tens return %0#0, %0#1 : tensor, tensor } -// CHECK: %[[ISLAND:[0-9]*]]:3 = tf_executor.island { -// CHECK-NEXT: %[[OP_A:[0-9]*]]:2 = "tf.opA"(%[[ARG_1]], %[[ARG_0]]) -// CHECK-NEXT: tf_executor.yield %[[OP_A]]#0, %[[OP_A]]#1 : tensor, tensor +// CHECK: %[[ISLAND:[0-9]*]]:3 = tf_executor.island wraps "tf.opA"(%[[ARG_1]], %[[ARG_0]]) // CHECK: tf_executor.fetch %[[ISLAND]]#0, %[[ISLAND]]#1 : tensor, tensor @@ -228,9 +226,7 @@ func @islands_interleaved(%arg0 : tensor, %arg1 : tensor) -> (tensor -// CHECK: tf_executor.island { -// CHECK-NEXT: %[[OP_F:[0-9]*]] = "tf.opF"(%[[ARG_1]]) -// CHECK-NEXT: tf_executor.yield %[[OP_F]] : tensor +// CHECK: tf_executor.island wraps "tf.opF"(%[[ARG_1]]) // CHECK: tf_executor.fetch %[[ISLAND_0]]#0, %[[ISLAND_1]]#0 : tensor, tensor @@ -279,13 +275,9 @@ func @merge_islands_only() { return } -// CHECK: %[[ISLAND_0:[0-9]*]]:2 = tf_executor.island { -// CHECK-NEXT: %[[OP_A:.*]] = "tf.opA" -// CHECK-NEXT: tf_executor.yield %[[OP_A]] : tensor +// CHECK: %[[ISLAND_0:[0-9]*]]:2 = tf_executor.island wraps "tf.opA" // CHECK: %[[ENTER:[0-9]*]]:2 = tf_executor.Enter %[[ISLAND_0]]#0 -// CHECK-NEXT: %[[ISLAND_1:[0-9]*]] = tf_executor.island { -// CHECK-NEXT: "tf.opB"() -// CHECK-NEXT: tf_executor.yield +// CHECK-NEXT: %[[ISLAND_1:[0-9]*]] = tf_executor.island wraps "tf.opB"() // CHECK: %[[NEXTIT_SRC:[0-9]*]]:3 = tf_executor.NextIteration.Source // CHECK-NEXT: %[[MERGE:[0-9]*]]:3 = tf_executor.Merge %[[NEXTIT_SRC]]#0, %[[ENTER]]#0 // CHECK-NEXT: %[[ISLAND_2:[0-9]*]]:2 = tf_executor.island(%[[MERGE]]#2) { @@ -322,9 +314,7 @@ func @simple_potential_cycle() { return } -// CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island { -// CHECK-NEXT: %[[OP_A:[0-9]*]] = "tf.opA" -// CHECK-NEXT: tf_executor.yield %[[OP_A]] : tensor<1xf32> +// CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island wraps "tf.opA" // CHECK: %[[CT:[0-9]*]] = tf_executor.ControlTrigger %[[ISLAND]]#1 // CHECK-NEXT: tf_executor.island(%[[CT]]) { // CHECK-NEXT: %[[OP_B:[0-9]*]] = "tf.opB" @@ -384,9 +374,7 @@ func @merge_into_nested_data_result() { // CHECK-NEXT: [[OP_A:[0-9*]]] = "tf.opA" // CHECK-NEXT: [[INNER_GRAPH:[0-9]*]] = tf_executor.graph { // CHECK-NEXT: [[CT:[0-9]*]] = tf_executor.ControlTrigger -// CHECK-NEXT: [[ISLAND_1:[0-9]*]]:2 = tf_executor.island(%[[CT]]) { -// CHECK-NEXT: [[OP_B:[0-9]*]] = "tf.opB"(%[[OP_A]]) -// CHECK-NEXT: tf_executor.yield %[[OP_B]] : tensor<1xf32> +// CHECK-NEXT: [[ISLAND_1:[0-9]*]]:2 = tf_executor.island(%[[CT]]) wraps "tf.opB"(%[[OP_A]]) // CHECK: tf_executor.fetch %[[ISLAND_1]]#0 : tensor<1xf32> // CHECK: 
tf_executor.yield @@ -422,18 +410,14 @@ func @merge_islands_inner_graph() { return } -// CHECK: tf_executor.island { -// CHECK-NEXT: [[OP_A:[0-9*]]] = "tf.opA" -// CHECK-NEXT: tf_executor.yield %[[OP_A]] : tensor<1xf32> -// CHECK: tf_executor.island { -// CHECK-NEXT: [[INNER_GRAPH:[0-9]*]] = tf_executor.graph { +// CHECK: tf_executor.island wraps "tf.opA" +// CHECK: tf_executor.island wraps "tf_executor.graph"() ( { // CHECK-NEXT: [[ISLAND_1:[0-9]*]]:2 = tf_executor.island { // CHECK-NEXT: "tf.opB" // CHECK-NEXT: [[OP_C:[0-9]*]] = "tf.opC" // CHECK-NEXT: [[OP_D:[0-9]*]] = "tf.opD"(%[[OP_C]]) // CHECK-NEXT: tf_executor.yield %[[OP_D]] : tensor<1xf32> // CHECK: tf_executor.fetch %[[ISLAND_1]]#0 : tensor<1xf32> -// CHECK: tf_executor.yield %[[INNER_GRAPH]] : tensor<1xf32> // Test merging islands with control island operands and island results only if @@ -454,7 +438,7 @@ func @merge_islands_closest_control() { return } -// CHECK: %[[ISLAND:[0-9]*]] = tf_executor.island { +// CHECK: %[[ISLAND:[0-9]*]] = tf_executor.island // CHECK: tf_executor.ControlTrigger %[[ISLAND]] // CHECK: %[[CT:[0-9]*]] = tf_executor.ControlTrigger -// CHECK: tf_executor.island(%[[ISLAND]], %[[CT]]) { +// CHECK: tf_executor.island(%[[ISLAND]], %[[CT]]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_island_materialize_const.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_materialize_const.mlir new file mode 100644 index 00000000000..49247dede30 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_materialize_const.mlir @@ -0,0 +1,23 @@ +// RUN: tf-opt %s -canonicalize | FileCheck %s --dump-input=fail + +// Test that a constant stays inside an island after canonicalization + +// CHECK-LABEL: func @constant_in_island +func @constant_in_island(%arg0 : tensor) -> tensor { + %0 = tf_executor.graph { +// CHECK: tf_executor.island +// CHECK: tf.Const{{.*}}2.0 + %1:2 = tf_executor.island { + %0 = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + tf_executor.yield %0 : tensor + } +// Uses two islands for no other reason than preventing canonicalization from +// eliminating the graph entirely. + %2:2 = tf_executor.island(%1#1) { + %4 = "tf.opB"(%1#0) : (tensor) -> tensor + tf_executor.yield %4 : tensor + } + tf_executor.fetch %2#0 : tensor + } + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir index 11b9b1a564d..5ff18c3cae3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir @@ -97,3 +97,24 @@ func @switchN(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { } return %fetches : tensor<*xf32> } + +// Test if tf_executor dialect ops with Ref types are mapped correctly to the ops in control dialect. 
+// CHECK-LABEL: func @ref_tf_executor_ops +func @ref_tf_executor_ops(%arg0: tensor<4x!tf.f32ref>, %arg1: tensor<4x!tf.f32ref>, %arg3: tensor, %arg4: tensor ) -> tensor<4x!tf.f32ref> { + %result = tf_executor.graph { + // CHECK: _tf.Enter + %0:2 = tf_executor.Enter %arg0 frame "while/while_context" : (tensor<4x!tf.f32ref>) -> (tensor<4x!tf.f32ref>, !tf_executor.control) + // CHECK: _tf.Exit + %1:2 = tf_executor.Exit %arg0 : tensor<4x!tf.f32ref> + // CHECK: _tf.Switch + %2:3 = tf_executor.Switch %arg0, %arg4 : (tensor<4x!tf.f32ref>, tensor) -> (tensor<4x!tf.f32ref>, tensor<4x!tf.f32ref>, !tf_executor.control) + // CHECK: _tf.Merge + %3:3 = tf_executor.Merge %arg0, %arg1 : (tensor<4x!tf.f32ref>, tensor<4x!tf.f32ref>) -> (tensor<4x!tf.f32ref>, tensor, !tf_executor.control) + // CHECK: _tf.NextIteration.source + %4:3 = tf_executor.NextIteration.Source : tensor<4x!tf.f32ref> + // CHECK: _tf.NextIteration.sink + tf_executor.NextIteration.Sink [%4#1] %4#0 : tensor<4x!tf.f32ref> + tf_executor.fetch %0#0 : tensor<4x!tf.f32ref> + } + return %result : tensor<4x!tf.f32ref> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt index a2b9efff36b..15289bf47ab 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt @@ -39,13 +39,10 @@ versions { # CHECK: func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> # CHECK: attributes {tf.entry_function = {inputs = "input0, input1", outputs = "Add"}} { -# CHECK: %[[INPUT0:[0-9]+]]:2 = tf_executor.island -# CHECK-NEXT: "tf.Placeholder.input"(%arg0) +# CHECK: %[[INPUT0:[0-9]+]]:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg0) -# CHECK: %[[INPUT1:[0-9]+]]:2 = tf_executor.island -# CHECK-NEXT: "tf.Placeholder.input"(%arg1) +# CHECK: %[[INPUT1:[0-9]+]]:2 = tf_executor.island wraps "tf.Placeholder.input"(%arg1) -# CHECK: %[[add:[0-9]+]]:2 = tf_executor.island -# CHECK-NEXT: "tf.Add"(%[[INPUT0]]#0, %[[INPUT1]]#0) +# CHECK: %[[add:[0-9]+]]:2 = tf_executor.island wraps "tf.Add"(%[[INPUT0]]#0, %[[INPUT1]]#0) # CHECK: fetch %[[add]]#0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt index 74adc38d87d..7b3462f37cd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-control-dep.pbtxt @@ -41,8 +41,7 @@ library { } # Drop the control dependency on arg for the node "test" # CHECK-LABEL: func @foo - # CHECK: tf_executor.island { - # CHECK-NEXT: "tf.Const"() + # CHECK: tf_executor.island wraps "tf.Const"() node_def { name: "test" op: "Const" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt index 1bf5037a75f..5c4c23a67db 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -6,12 +6,9 @@ # CHECK: func @main(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>) -> (tensor, tensor) # CHECK: attributes {tf.entry_function = {inputs = "args_0, args_1", outputs = "rets_0_RetVal, rets_1_RetVal"}} { -# CHECK: %[[ISLAND_0:[0-9]]]:2 = tf_executor.island { -# CHECK: "tf.Const" -# CHECK: 
%[[ISLAND_1:[0-9]]]:2 = tf_executor.island { -# CHECK: "tf.Identity"(%[[ISLAND_0]]#0) -# CHECK: %[[ISLAND_2:[0-9]]]:2 = tf_executor.island { -# CHECK: "tf.StatefulPartitionedCall" +# CHECK: %[[ISLAND_0:[0-9]]]:2 = tf_executor.island wraps "tf.Const" +# CHECK: %[[ISLAND_1:[0-9]]]:2 = tf_executor.island wraps "tf.Identity"(%[[ISLAND_0]]#0) +# CHECK: %[[ISLAND_2:[0-9]]]:2 = tf_executor.island wraps "tf.StatefulPartitionedCall" # CHECK-SAME: f = @[[FUNC:[a-z0-9]*]] # CHECK: tf_executor.fetch %[[ISLAND_1]]#0, %[[ISLAND_2]]#0 : tensor, tensor # CHECK: func @[[FUNC]](%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt index 9238ea92a20..fa095a19eff 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-diff-island.pbtxt @@ -5,7 +5,7 @@ # FetchOp. # Match the island containing the "tf.Neg", capture the output -# CHECK: %[[ISLAND_0:[0-9]*]]:2 = tf_executor.island {{.*[[:space:]].*}} "tf.Neg" +# CHECK: %[[ISLAND_0:[0-9]*]]:2 = tf_executor.island wraps "tf.Neg" # Check that the tf.Neg control is passed to the fetch # CHECK: tf_executor.fetch {{.*}} %[[ISLAND_0]]#1 : tensor<*xf32>, !tf_executor.control diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt index adad8b109b6..dbb1d14e331 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-control-ret-same-island.pbtxt @@ -5,7 +5,7 @@ # FetchOp. # Match the island containing the "tf.Neg", capture the output -# CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island {{.*[[:space:]].*}} "tf.Neg" +# CHECK: %[[ISLAND:[0-9]*]]:2 = tf_executor.island wraps "tf.Neg" # Check that the tf.Neg data output and control are passed to the fetch # CHECK: tf_executor.fetch %[[ISLAND]]#0, %[[ISLAND]]#1 : tensor<*xf32>, !tf_executor.control diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-input-shapes.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-input-shapes.pbtxt new file mode 100644 index 00000000000..fc27e82d20e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-input-shapes.pbtxt @@ -0,0 +1,110 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s + +# Verify that the _input_shapes attribute of the FunctionDef is respected. +# This also checks that the output type is correctly inferred based on +# that. 
+#CHECK: func @identity_function0(%arg0: tensor) -> tensor + +node { + name: "Placeholder" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_BOOL + } + } + experimental_debug_info { + } +} +node { + name: "Placeholder_1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + experimental_debug_info { + } +} +node { + name: "If" + op: "If" + input: "Placeholder" + input: "Placeholder_1" + attr { + key: "Tcond" + value { + type: DT_BOOL + } + } + attr { + key: "Tin" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "else_branch" + value { + func { + name: "identity_function" + } + } + } + attr { + key: "then_branch" + value { + func { + name: "identity_function" + } + } + } + experimental_debug_info { + } +} +library { + function { + signature { + name: "identity_function" + input_arg { + name: "identity_input" + type: DT_INT32 + } + output_arg { + name: "identity_output" + type: DT_INT32 + } + } + ret { + key: "identity_output" + value: "identity_input" + } + attr { + key: "_input_shapes" + value { + list { + shape { + } + } + } + } + } +} +versions { + producer: 29 + min_consumer: 12 +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-variable-shapes.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-variable-shapes.pbtxt new file mode 100644 index 00000000000..e75fe8c9d67 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-variable-shapes.pbtxt @@ -0,0 +1,177 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s + +# Verify that the _output_shapes attribute of ReadVariableOp's are used to get +# variable types. +# This also checks that the output type is correctly inferred based on +# that. 
+# CHECK: func @__inference_some_function_130(%arg0: tensor<*x!tf.resource>) -> tensor +# CHECK: tf.ReadVariableOp"(%arg0) {{.*}} : (tensor<*x!tf.resource>) -> tensor + + +node { + name : "Variable" + op : "VarHandleOp" + attr { + key : "shape" + value { + shape { + } + } + } + attr { + key : "dtype" + value { + type : DT_FLOAT + } + } + attr { + key : "shared_name" + value { + s: "Variable" + } + } + attr { + key : "_output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name : "StatefulPartitionedCall" + op : "StatefulPartitionedCall" + input : [ "Variable" ] + attr { + key : "f" + value { + func { + name: "__inference_some_function_13" + } + } + } + attr { + key : "config_proto" + value { + s: "\n\x07\n\x03GPU\x10\x00\n\x07\n\x03\x43PU\x10\x01\x32\x02J\x00\x38\x01" + } + } + attr { + key : "Tout" + value { + list { + type : [ DT_FLOAT ] + } + } + } + attr { + key : "_gradient_op_type" + value { + s: "PartitionedCall-29" + } + } + attr { + key : "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key : "Tin" + value { + list { + type : [ DT_RESOURCE ] + } + } + } +} +library { + function { + signature { + name: "__inference_some_function_13" + input_arg { + name : "readvariableop_resource" + type : DT_RESOURCE + } + output_arg { + name : "identity" + type : DT_FLOAT + } + is_stateful : true + control_output: [ "ReadVariableOp" ] + } + node_def { + name : "ReadVariableOp" + op : "ReadVariableOp" + input : [ "readvariableop_resource" ] + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key : "dtype" + value { + type : DT_FLOAT + } + } + attr { + key : "_output_shapes" + value { + list { + shape { + } + } + } + } + } + node_def { + name : "Identity" + op : "Identity" + input : [ "ReadVariableOp:value:0", "^ReadVariableOp" ] + attr { + key : "T" + value { + type : DT_FLOAT + } + } + attr { + key : "_output_shapes" + value { + list { + shape { + } + } + } + } + } + ret { + key : "identity" + value: "Identity:output:0" + } + attr { + key : "_input_shapes" + value { + list { + shape { + unknown_rank: true + } + } + } + } + control_ret { + key : "ReadVariableOp" + value: "ReadVariableOp" + } + arg_attr { + key : 0x00000000 + value { + } + } + } +} +versions { + producer : 148 + min_consumer : 12 +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt index 37f7a876814..be059e0b2d2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt @@ -7,8 +7,7 @@ # CHECK: "tf.Placeholder.input"(%arg0) # CHECK: tf.Relu -# CHECK: %[[IDENTITY:[0-9]+]]:3 = tf_executor.island -# CHECK-NEXT: tf.Identity +# CHECK: %[[IDENTITY:[0-9]+]]:3 = tf_executor.island wraps "tf.IdentityN" # CHECK: fetch %[[IDENTITY]]#1, %[[IDENTITY]]#0 : tensor, tensor node { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt new file mode 100644 index 00000000000..1df903d46ce --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/mlir_passthrough_op.pbtxt @@ -0,0 +1,101 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s | FileCheck %s + +# CHECK:"tf.MlirPassthroughOp" +# CHECK: mlir_module = "\0Afunc @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> {\0A %add = 
\22tf.Add\22(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>\0A %ret = \22magic.op\22(%add, %add) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32>\0A return %ret : tensor<10x10xf32>\0A}\0A", name = "MlirPassthroughOp"} : (tensor<10xf32>, tensor<10xf32>) -> tensor<*xf32> + +node { + name: "x" + op: "Placeholder" + attr { + key: "_user_specified_name" + value { + s: "x" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 10 + } + } + } + } +} +node { + name: "y" + op: "Placeholder" + attr { + key: "_user_specified_name" + value { + s: "y" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 10 + } + } + } + } +} +node { + name: "MlirPassthroughOp" + op: "MlirPassthroughOp" + input: "x" + input: "y" + attr { + key: "Tinputs" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "Toutputs" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "mlir_module" + value { + s: "\nfunc @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> {\n %add = \"tf.Add\"(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>\n %ret = \"magic.op\"(%add, %add) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32>\n return %ret : tensor<10x10xf32>\n}\n" + } + } +} +node { + name: "Identity" + op: "Identity" + input: "MlirPassthroughOp" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +versions { + producer: 148 +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir b/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir index 4566ffb507c..ac6838c9d58 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/isolate-placer.mlir @@ -13,15 +13,12 @@ func @foo(%arg0: tensor) -> tensor { // The IsolatePlacerInspectionRequiredOpsPass adds Identities for each input/output of function-calling ops. // Capture the result of input to function call. -// CHECK: [[VARIABLE_REG:%[0-9]*]]:2 = tf_executor.island -// CHECK-NEXT: "tf.VarHandleOp"() +// CHECK: [[VARIABLE_REG:%[0-9]*]]:2 = tf_executor.island wraps "tf.VarHandleOp"() // Test for the presence of Identity op between input and function call. -// CHECK: [[IDENTITY_REG:%[0-9]*]]:2 = tf_executor.island -// CHECK-NEXT: "tf.Identity"([[VARIABLE_REG]]#0) +// CHECK: [[IDENTITY_REG:%[0-9]*]]:2 = tf_executor.island wraps "tf.Identity"([[VARIABLE_REG]]#0) -// CHECK: [[CALL_RESULT_REG:%[0-9]*]]:2 = tf_executor.island -// CHECK-NEXT: "tf.StatefulPartitionedCall"([[IDENTITY_REG]]#0) +// CHECK: [[CALL_RESULT_REG:%[0-9]*]]:2 = tf_executor.island wraps "tf.StatefulPartitionedCall"([[IDENTITY_REG]]#0) // CHECK-SAME: f = @[[FUNCTION:[a-zA-Z0-9_]*]] // Match the inserted Identity op for call output. 
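Note: the CHECK-line updates above (and throughout this change) switch from matching the expanded island region to the compact `wraps` printed form of tf_executor.island, which is used when an island holds exactly one operation whose results feed the yield directly. A minimal sketch of the two equivalent spellings, with the operand `%arg0` and the tensor types assumed purely for illustration:

// Expanded form, as the old CHECK lines matched it (operand/types illustrative):
%island:2 = tf_executor.island {
  %out = "tf.opA"(%arg0) : (tensor<1xf32>) -> tensor<1xf32>
  tf_executor.yield %out : tensor<1xf32>
}
// Compact form matched by the new `wraps` CHECK lines:
%island:2 = tf_executor.island wraps "tf.opA"(%arg0) : (tensor<1xf32>) -> tensor<1xf32>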
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir new file mode 100644 index 00000000000..42721d2a406 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir @@ -0,0 +1,25 @@ +// RUN: tf-opt %s -test-tf-lower-tf | FileCheck %s --dump-input-on-failure + +// CHECK-LABEL: simple_pack +// CHECK-SAME: %[[ARG0:.*]]: tensor<3x5xf32>, %[[ARG1:.*]]: tensor<3x5xf32> +func @simple_pack(%arg0: tensor<3x5xf32>, %arg1: tensor<3x5xf32>) -> tensor<2x3x5xf32> { + // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: %[[INP0:.*]] = "tf.ExpandDims"(%[[ARG0]], %[[AXIS]]) : (tensor<3x5xf32>, tensor) -> tensor<1x3x5xf32> + // CHECK: %[[INP1:.*]] = "tf.ExpandDims"(%[[ARG1]], %[[AXIS]]) : (tensor<3x5xf32>, tensor) -> tensor<1x3x5xf32> + // CHECK: "tf.ConcatV2"(%[[INP0]], %[[INP1]], %[[AXIS]]) {N = 2 : i64} : (tensor<1x3x5xf32>, tensor<1x3x5xf32>, tensor) -> tensor<2x3x5xf32> + + %0 = "tf.Pack"(%arg0, %arg1) {N = 2 : i64} : (tensor<3x5xf32>, tensor<3x5xf32>) -> tensor<2x3x5xf32> + return %0 : tensor<2x3x5xf32> +} + +// CHECK-LABEL: pack_with_unranked +// CHECK-SAME: %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor<*xf32> +func @pack_with_unranked(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<-2> : tensor} + // CHECK: %[[INP0:.*]] = "tf.ExpandDims"(%[[ARG0]], %[[AXIS]]) : (tensor, tensor) -> tensor + // CHECK: %[[INP1:.*]] = "tf.ExpandDims"(%[[ARG1]], %[[AXIS]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> + // CHECK: "tf.ConcatV2"(%[[INP0]], %[[INP1]], %[[AXIS]]) {N = 2 : i64} : (tensor, tensor<*xf32>, tensor) -> tensor<*xf32> + + %0 = "tf.Pack"(%arg0, %arg1) {axis = -2 : i64, N = 2 : i64} : (tensor, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/materialize_passthrough_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/materialize_passthrough_op.mlir new file mode 100644 index 00000000000..dd695f0b871 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/materialize_passthrough_op.mlir @@ -0,0 +1,15 @@ +// RUN: tf-opt -tf-materialize-passthrough-op %s | FileCheck %s --dump-input=fail + + +// Check that the MlirPassthroughOp is eliminated and replaced by its attached +// MLIR module. 
+ +// CHECK-LABEL: func @main +func @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> { +// CHECK-SAME: (%[[ARG0:.*]]: tensor<10xf32>, %[[ARG1:.*]]: tensor<10xf32>) +// CHECK-NEXT: %[[ADD:.*]] = "tf.Add"(%[[ARG0]], %[[ARG1]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> +// CHECK-NEXT: %[[MAGIC:.*]] = "magic.op"(%[[ADD]], %[[ADD]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32> +// CHECK-NEXT: return %[[MAGIC]] + %0 = "tf.MlirPassthroughOp"(%arg0, %arg1) {Tinputs = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Toutputs = ["tfdtype$DT_FLOAT"], device = "", mlir_module = "\0Afunc @main(%arg0 : tensor<10xf32>, %arg1 : tensor<10xf32>) -> tensor<10x10xf32> {\0A %add = \22tf.Add\22(%arg0, %arg1) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32>\0A %ret = \22magic.op\22(%add, %add) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32>\0A return %ret : tensor<10x10xf32>\0A}\0A", name = "MlirPassthroughOp"} : (tensor<10xf32>, tensor<10xf32>) -> tensor<10x10xf32> + return %0 : tensor<10x10xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-while-loop.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-while-loop.mlir new file mode 100644 index 00000000000..f4addb85967 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/ref-while-loop.mlir @@ -0,0 +1,23 @@ +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s + +// Verify the ops generated when Ref type is used in a while loop. +func @main() { + // CHECK: op: "RefEnter" + // CHECK: op: "RefMerge" + // CHECK: op: "RefSwitch" + // CHECK: op: "RefExit" + // CHECK: op: "RefNextIteration" + %0:2 = "_tf.NextIteration.source"() {device = "", T = "tfdtype$DT_INT32"} : () -> (tensor<*x!tf.int32ref>, !_tf.control) loc("while/NextIteration") + %1:2 = "_tf.VariableV2"() {device = "", dtype = "tfdtype$DT_INT32", value = dense<0> : tensor} : () -> (tensor, !_tf.control) loc("Ref_Variable") + %2:2 = "_tf.Enter"(%1#0) {device = "", T = "tfdtype$DT_INT32", frame_name = "while/while_context", is_constant = false, parallel_iterations = 10} : (tensor) -> (tensor<*x!tf.int32ref>, !_tf.control) loc("while/Enter") + %3:3 = "_tf.Merge"(%2#0, %0#0) {device = "", N = 2, T = "tfdtype$DT_INT32"} : (tensor<*x!tf.int32ref>, tensor<*x!tf.int32ref>) -> (tensor<*x!tf.int32ref>, tensor, !_tf.control) loc("while/Merge") + %4:2 = "_tf.Const"(%3#2) {device = "", dtype = "tfdtype$DT_INT32", value = dense<10> : tensor} : (!_tf.control) -> (tensor, !_tf.control) loc("while/Less/y") + %5:2 = "_tf.Less"(%3#0, %4#0) {device = "", T = "tfdtype$DT_INT32"} : (tensor<*x!tf.int32ref>, tensor) -> (tensor<*xi1>, !_tf.control) loc("while/Less") + %6:2 = "_tf.LoopCond"(%5#0) {device = ""} : (tensor<*xi1>) -> (tensor, !_tf.control) loc("while/LoopCond") + %7:3 = "_tf.Switch"(%3#0, %6#0) {device = "", T = "tfdtype$DT_INT32", _class = ["loc:@while/Merge"]} : (tensor<*x!tf.int32ref>, tensor) -> (tensor<*x!tf.int32ref>, tensor<*x!tf.int32ref>, !_tf.control) loc("while/Switch") + %8:2 = "_tf.Exit"(%7#1) {device = "", T = "tfdtype$DT_INT32"} : (tensor<*x!tf.int32ref>) -> (tensor<*x!tf.int32ref>, !_tf.control) loc("while/Exit") + %10:2 = "_tf.Const"(%7#2) {device = "", dtype = "tfdtype$DT_INT32", value = dense<1> : tensor} : (!_tf.control) -> (tensor, !_tf.control) loc("while/Add/y") + %11:2 = "_tf.AssignAdd"(%7#0, %10#0) {device = "", T = "tfdtype$DT_INT32"} : (tensor<*x!tf.int32ref>, tensor) -> (tensor<*x!tf.int32ref>, !_tf.control) loc("while/Add") + %12 = 
"_tf.NextIteration.sink"(%11#0) {device = "", T = "tfdtype$DT_INT32"} : (tensor<*x!tf.int32ref>) -> !_tf.control loc("while/NextIteration") + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/sink_constant.mlir b/tensorflow/compiler/mlir/tensorflow/tests/sink_constant.mlir new file mode 100644 index 00000000000..10ff24a5336 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/sink_constant.mlir @@ -0,0 +1,41 @@ +// RUN: tf-opt %s -tf-device-constant-sinking | FileCheck %s --dump-input=fail + +// CHECK-LABEL: func @sink_const +func @sink_const(%arg0 : tensor<16xf32>) -> (tensor<16xf32>, tensor) { + // Verify that the constant are sunk in the tf_device.launch region using them + // and removed if no other use is left. + + // Only the 2.0 and 3.0 constants are removed, the 4.0 has a use in the return + // CHECK-NOT:"tf.Const"2.0 + // CHECK-NOT:"tf.Const"3.0 + %0 = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<3.000000e+00> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<4.000000e+00> : tensor} : () -> tensor + %3 = tf_executor.graph { + %res, %ctl = tf_executor.island { + %3 = "tf_device.launch"() ({ + + // In the device region, check that the 3 constants are materialized and + // remapped to the uses. + // CHECK: tf_device.launch + // CHECK-DAG: %[[CST2:.*]] = "tf.Const"{{.*}}2.0 + // CHECK-DAG: %[[CST3:.*]] = "tf.Const"{{.*}}3.0 + // CHECK-DAG: %[[CST4:.*]] = "tf.Const"{{.*}}4.0 + // CHECK-NOT:"tf.Const" + // CHECK: %[[MUL1:.*]] = "tf.Mul"(%arg0, %[[CST2]]) + // CHECK-NEXT: %[[MUL2:.*]] = "tf.Mul"(%[[MUL1]], %[[CST2]]) + // CHECK-NEXT: %[[MUL3:.*]] = "tf.Mul"(%[[MUL2]], %[[CST3]]) + // CHECK-NEXT: = "tf.Mul"(%[[MUL3]], %[[CST4]]) + %3 = "tf.Mul"(%arg0, %0) : (tensor<16xf32>, tensor) -> tensor<16xf32> + %4 = "tf.Mul"(%3, %0) : (tensor<16xf32>, tensor) -> tensor<16xf32> + %5 = "tf.Mul"(%4, %1) : (tensor<16xf32>, tensor) -> tensor<16xf32> + %6 = "tf.Mul"(%5, %2) : (tensor<16xf32>, tensor) -> tensor<16xf32> + "tf_device.return"(%6) : (tensor<16xf32>) -> () + }) {device = "tpu0"} : () -> tensor<16xf32> + tf_executor.yield %3 : tensor<16xf32> + } + tf_executor.fetch %res : tensor<16xf32> + } + return %3, %2 : tensor<16xf32>, tensor +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index dd6d77f7816..b702f5fe88c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -83,6 +83,15 @@ func @testBitcast(%arg0: tensor<3x4x!tf.uint16>) -> tensor<3x4x!tf.quint16> { // ----- +// CHECK-LABEL: func @testReverseV2 +func @testReverseV2(%arg0: tensor<2x4x3x!tf.uint8>, %arg1: tensor<1xi32>) -> tensor<2x4x3x!tf.uint8> { + // CHECK: tf.ReverseV2 + %0 = "tf.ReverseV2"(%arg0, %arg1) : (tensor<2x4x3x!tf.uint8>, tensor<1xi32>) -> tensor<2x4x3x!tf.uint8> + return %0 : tensor<2x4x3x!tf.uint8> +} + +// ----- + func @testIdentityWrongType(%arg0: tensor<4x2x!tf.string>) -> tensor<4x2x!tf.stringref> { // expected-error @+1 {{requires all operands to be either same as or ref type of results}} %0 = "tf.Identity"(%arg0) : (tensor<4x2x!tf.string>) -> tensor<4x2x!tf.stringref> @@ -459,6 +468,37 @@ func @testInvalidFakeQuantWithMinMaxVarsWrongMaxType(tensor<8x8x8x8xf32>, tensor // ----- +// Test valid tf.FakeQuantWithMinMaxVarsPerChannel +// CHECK-LABEL: func @FakeQuantWithMinMaxVarsPerChannel +func @FakeQuantWithMinMaxVarsPerChannel(tensor<1x2x3x8xf32>, tensor<8xf32>, tensor<8xf32>) -> 
tensor<1x2x3x8xf32> { +^bb0(%arg0: tensor<1x2x3x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>): + // CHECK: "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg0, %arg1, %arg2) : (tensor<1x2x3x8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<1x2x3x8xf32> + %0 = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg0, %arg1, %arg2) : (tensor<1x2x3x8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<1x2x3x8xf32> + return %0 : tensor<1x2x3x8xf32> +} + +// ----- + +// Test invalid tf.FakeQuantWithMinMaxVarsPerChannel +func @FakeQuantWithMinMaxVarsPerChannel_ranked_inputs(tensor, tensor<8xf32>, tensor<8xf32>) -> tensor { +^bb0(%arg0: tensor, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>): + // expected-error @+1 {{requires inputs to be at least 1d float tensor}} + %0 = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg0, %arg1, %arg2) : (tensor, tensor<8xf32>, tensor<8xf32>) -> tensor + return %0 : tensor +} + +// ----- + +// Test invalid tf.FakeQuantWithMinMaxVarsPerChannel +func @FakeQuantWithMinMaxVarsPerChannel_mismatch_min_max(tensor<1x2x3x8xf32>, tensor<1xf32>, tensor<8xf32>) -> tensor<1x2x3x8xf32> { +^bb0(%arg0: tensor<1x2x3x8xf32>, %arg1: tensor<1xf32>, %arg2: tensor<8xf32>): + // expected-error @+1 {{requires min and max to have same size as last dimension of inputs}} + %0 = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg0, %arg1, %arg2) : (tensor<1x2x3x8xf32>, tensor<1xf32>, tensor<8xf32>) -> tensor<1x2x3x8xf32> + return %0 : tensor<1x2x3x8xf32> +} + +// ----- + // Test valid tf.FusedBatchNorm // CHECK-LABEL: func @testFusedBatchNorm func @testFusedBatchNorm(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> { @@ -944,25 +984,25 @@ func @testLess(tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> { // ----- // Test valid tf.ConcatV2 -func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor<1xi32>) -> tensor { - // CHECK: %0 = "tf.ConcatV2"(%arg0, %arg0, %arg1) {N = 2 : i64} : (tensor<8x16xf32>, tensor<8x16xf32>, tensor<1xi32>) -> tensor - %0 = "tf.ConcatV2"(%arg, %arg, %axis) {N = 2: i64} : (tensor<8x16xf32>, tensor<8x16xf32>, tensor<1xi32>) -> tensor +func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor { + // CHECK: %0 = "tf.ConcatV2"(%arg0, %arg0, %arg1) {N = 2 : i64} : (tensor<8x16xf32>, tensor<8x16xf32>, tensor) -> tensor + %0 = "tf.ConcatV2"(%arg, %arg, %axis) {N = 2: i64} : (tensor<8x16xf32>, tensor<8x16xf32>, tensor) -> tensor return %0 : tensor } // ----- // tf.ConcatV2 with wrong 'axis' element type -func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor<1xf32>) -> tensor { +func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor { // expected-error @+1 {{operand #2 must be tensor of 32/64-bit integer values}} - %0 = "tf.ConcatV2"(%arg, %arg, %axis) {N = 2: i64} : (tensor<8x16xf32>, tensor<8x16xf32>, tensor<1xf32>) -> tensor + %0 = "tf.ConcatV2"(%arg, %arg, %axis) {N = 2: i64} : (tensor<8x16xf32>, tensor<8x16xf32>, tensor) -> tensor return %0 : tensor } // ----- // tf.ConcatV2 missing required 'axis' operand -func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor<1xi32>) -> tensor { +func @testConcatV2() -> tensor { // expected-error @+1 {{expected 1 or more operands}} %0 = "tf.ConcatV2"() {N = 0: i64} : () -> tensor return %0 : tensor @@ -971,9 +1011,165 @@ func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor<1xi32>) -> tensor, %axis: tensor<1xi32>) -> tensor { +func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor { // expected-error @+1 {{attribute 'N' failed to satisfy constraint: 64-bit integer attribute 
whose minimal value is 2}} - %0 = "tf.ConcatV2"(%arg, %axis) {N = 1: i64} : (tensor<8x16xf32>, tensor<1xi32>) -> tensor + %0 = "tf.ConcatV2"(%arg, %axis) {N = 1: i64} : (tensor<8x16xf32>, tensor) -> tensor return %0 : tensor } +// ----- + +func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor { + // expected-error @+1 {{requires attribute 'N' to match the number of inputs; expected: 2 Found: 3}} + %0 = "tf.ConcatV2"(%arg, %arg, %axis) {N = 3: i64} : (tensor<8x16xf32>, tensor<8x16xf32>, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testAll(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { + %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor + return %0 : tensor + + // CHECK-LABEL: testAll + // CHECK: %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor +} + +// ----- + +func @testAll64(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { + %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor + return %0 : tensor + + // CHECK-LABEL: testAll64 + // CHECK: %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor +} + +// ----- + +func @testAllFloat(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { + // expected-error @+1 {{'tf.All' op operand #1 must be tensor of 32/64-bit integer values}} + %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testAllI32(%arg0: tensor<2x2xi32>, %arg1: tensor) -> tensor { + // expected-error @+1 {{'tf.All' op operand #0 must be tensor of 1-bit integer values}} + %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi32>, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testEqualOpIncompatibleShapeTrue(%x: tensor<5xf32>, %y: tensor<4xf32>) -> tensor<5xi1> { + // expected-error @+1 {{operands don't have broadcast-compatible shapes}} + %0 = "tf.Equal"(%x, %y) {incompatible_shape_error = true} : (tensor<5xf32>, tensor<4xf32>) -> tensor<5xi1> + return %0 : tensor<5xi1> +} + +// ----- + +// CHECK-LABEL: testEqualOpIncompatibleShapeFalse +func @testEqualOpIncompatibleShapeFalse(%x: tensor<5xf32>, %y: tensor<4xf32>) -> tensor<*xi1> { + // CHECK: tf.Equal + %0 = "tf.Equal"(%x, %y) {incompatible_shape_error = false} : (tensor<5xf32>, tensor<4xf32>) -> tensor<*xi1> + return %0 : tensor<*xi1> +} + +// ----- + +func @testNotEqualOpIncompatibleShapeTrue(%x: tensor<5xf32>, %y: tensor<4xf32>) -> tensor<5xi1> { + // expected-error @+1 {{operands don't have broadcast-compatible shapes}} + %0 = "tf.NotEqual"(%x, %y) {incompatible_shape_error = true} : (tensor<5xf32>, tensor<4xf32>) -> tensor<5xi1> + return %0 : tensor<5xi1> +} + +// ----- + +// CHECK-LABEL: testNotEqualOpIncompatibleShapeFalse +func @testNotEqualOpIncompatibleShapeFalse(%x: tensor<5xf32>, %y: tensor<4xf32>) -> tensor<*xi1> { + // CHECK: tf.NotEqual + %0 = "tf.NotEqual"(%x, %y) {incompatible_shape_error = false} : (tensor<5xf32>, tensor<4xf32>) -> tensor<*xi1> + return %0 : tensor<*xi1> +} + +// ----- + +func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor<1x1xi32>) -> tensor<*xf32> { // expected-error @+1 {{requires axis to be of scalar type (or vector type for older versions)}} + %0 = "tf.ConcatV2"(%arg, %arg, %axis) {N = 2: i64} : (tensor<8x16xf32>, tensor<8x16xf32>, tensor<1x1xi32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor<1x1xi32>) -> tensor<*xf32> { + // expected-error @+1 
{{requires axis to be of scalar type (or vector type for older versions)}} + %0 = "tf.Concat"(%axis, %arg, %arg) {N = 2: i64} : (tensor<1x1xi32>, tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +func @testConcatV2(%arg0: tensor<8x16xf32>, %arg1: tensor<8xf32>, %axis: tensor) -> tensor<*xf32> { + // expected-error @+1 {{operand type 'tensor<8xf32>' is not compatible with preceding operands; expected rank: 2}} + %0 = "tf.ConcatV2"(%arg0, %arg1, %axis) {N = 2: i64} : (tensor<8x16xf32>, tensor<8xf32>, tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +// Valid Concat operation with concat axis 1 or -1. +func @testConcatV2(%arg0: tensor<8x16xf32>, %arg1: tensor<8x8xf32>, %axis: tensor) -> tensor<*xf32> { + %0 = "tf.ConcatV2"(%arg0, %arg1, %axis) {N = 2: i64} : (tensor<8x16xf32>, tensor<8x8xf32>, tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +func @testConcatV2(%arg0: tensor<8x16xf32>, %arg1: tensor<16x8xf32>, %axis: tensor) -> tensor<*xf32> { + // expected-error @+1 {{operand type 'tensor<16x8xf32>' is not compatible with preceding operands; expected dimension at index 1: 16}} + %0 = "tf.ConcatV2"(%arg0, %arg1, %axis) {N = 2: i64} : (tensor<8x16xf32>, tensor<16x8xf32>, tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +// Valid Concat operation with concat axis 1 or -1. +func @testConcatV2(%arg0: tensor<8x8xf32>, %arg1: tensor, %arg2: tensor<*xf32>, %arg3: tensor<8x?xf32>, %axis: tensor) -> tensor<*xf32> { + %0 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %arg3, %axis) {N = 4: i64} : (tensor<8x8xf32>, tensor, tensor<*xf32>, tensor<8x?xf32>, tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +// Valid Pack operation. +func @testPack(%arg0: tensor<4x8xf32>, %arg1: tensor<4x8xf32>) -> tensor<*xf32> { + %0 = "tf.Pack"(%arg0, %arg1) {axis = 1 : i64, N = 2: i64} : (tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +func @testPack(%arg0: tensor<4x8xf32>, %arg1: tensor<4x8xf32>) -> tensor<*xf32> { + // expected-error @+1 {{requires attribute 'N' to match the number of inputs; expected: 2 Found: 1}} + %0 = "tf.Pack"(%arg0, %arg1) {axis = 1 : i64, N = 1: i64} : (tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +func @testPack(%arg0: tensor<4x8xf32>, %arg1: tensor<4x2xf32>) -> tensor<*xf32> { + // expected-error @+1 {{operand type 'tensor<4x2xf32>' is not compatible with preceding operands; expected dimension at index 1: 8}} + %0 = "tf.Pack"(%arg0, %arg1) {axis = 1 : i64, N = 2: i64} : (tensor<4x8xf32>, tensor<4x2xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// ----- + +func @testPack(%arg0: tensor<4x8xf32>, %arg1: tensor<4x8xf32>, %axis: tensor) -> tensor<*xf32> { + // expected-error @+1 {{attribute 'axis' should be within range [-3, 3); actual value: 3}} + %0 = "tf.Pack"(%arg0, %arg1) {axis = 3 : i64, N = 2: i64} : (tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir index 2890656c013..fca724e196a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir @@ -271,6 +271,39 @@ func @merge_with_variant_type(%arg0: tensor, %arg1: tensor>> } +// CHECK-LABEL: func @merge_with_ref_type +func @merge_with_ref_type(%arg0: 
tensor<4x!tf.f32ref>, %arg1: tensor<4xf32>) -> tensor<4xf32> { + %result = tf_executor.graph { + +// CHECK: tf_executor.Merge{{.*}}(tensor<4x!tf.f32ref>, tensor<4xf32>) -> (tensor<4xf32>, tensor, !tf_executor.control) + %value, %idx, %ctlMerge = "tf_executor.Merge"(%arg0, %arg1) : (tensor<4x!tf.f32ref>, tensor<4xf32>) -> (tensor<4xf32>, tensor, !tf_executor.control) + tf_executor.fetch %value : tensor<4xf32> + } + return %result : tensor<4xf32> +} + +// CHECK-LABEL: func @merge_with_dynamic_shape +func @merge_with_dynamic_shape(%arg0: tensor<2xf32>, %arg1: tensor<3xf32>) -> tensor { + %result = tf_executor.graph { + +// CHECK: tf_executor.Merge{{.*}}(tensor<2xf32>, tensor<3xf32>) -> (tensor, tensor, !tf_executor.control) + %value, %idx, %ctlMerge = "tf_executor.Merge"(%arg0, %arg1) : (tensor<2xf32>, tensor<3xf32>) -> (tensor, tensor, !tf_executor.control) + tf_executor.fetch %value : tensor + } + return %result : tensor +} + +// CHECK-LABEL: func @merge_with_unranked_shape +func @merge_with_unranked_shape(%arg0: tensor<2xf32>, %arg1: tensor<3xf32>) -> tensor<*xf32> { + %result = tf_executor.graph { + +// CHECK: tf_executor.Merge{{.*}}(tensor<2xf32>, tensor<3xf32>) -> (tensor<*xf32>, tensor, !tf_executor.control) + %value, %idx, %ctlMerge = "tf_executor.Merge"(%arg0, %arg1) : (tensor<2xf32>, tensor<3xf32>) -> (tensor<*xf32>, tensor, !tf_executor.control) + tf_executor.fetch %value : tensor<*xf32> + } + return %result : tensor<*xf32> +} + // CHECK-LABEL: func @enter(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { func @enter(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { %result = tf_executor.graph { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir index ee3d2b91732..5803cc7b516 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir @@ -490,7 +490,7 @@ func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { %true, %false, %ctlSwitch = tf_executor.Switch %arg0, %arg1 : tensor<*xf32> %value, %idx, %ctlMerge = "tf_executor.Merge"(%true, %false, %arg1) : (tensor<*xf32>, tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor, !tf_executor.control) -// expected-error@-1 {{'tf_executor.Merge' op expects all operands to be broadcastable but got 'tensor<*xf32>' vs 'tensor'}} +// expected-error@-1 {{'tf_executor.Merge' op expects all operands to be broadcastable with output type but got 'tensor' vs 'tensor<*xf32>'}} tf_executor.fetch %value : tensor<*xf32> } return %result : tensor<*xf32> @@ -502,7 +502,7 @@ func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor<4xf32>) -> tensor<8xf32> { %result = tf_executor.graph { %value, %idx, %ctlMerge = "tf_executor.Merge"(%arg0, %arg1) : (tensor<*xf32>, tensor<4xf32>) -> (tensor<8xf32>, tensor, !tf_executor.control) -// expected-error@-1 {{'tf_executor.Merge' op expects all operands to be broadcastable but got 'tensor<8xf32>' vs 'tensor<4xf32>'}} +// expected-error@-1 {{'tf_executor.Merge' op expects all operands to be broadcastable with output type but got 'tensor<4xf32>' vs 'tensor<8xf32>'}} tf_executor.fetch %value : tensor<8xf32> } return %result : tensor<8xf32> @@ -514,7 +514,7 @@ func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor<4xf32>) -> tensor<8xf32> func @invalid_merge(%arg0: tensor<*x!tf.variant>, %arg1: tensor<4x!tf.variant>) -> 
tensor<8x!tf.variant> { %result = tf_executor.graph { %value, %idx, %ctlMerge = "tf_executor.Merge"(%arg0, %arg1) : (tensor<*x!tf.variant>, tensor<4x!tf.variant>) -> (tensor<8x!tf.variant>, tensor, !tf_executor.control) -// expected-error@-1 {{'tf_executor.Merge' op expects all operands to be broadcastable but got 'tensor<8x!tf.variant>' vs 'tensor<4x!tf.variant>'}} +// expected-error@-1 {{'tf_executor.Merge' op expects all operands to be broadcastable with output type but got 'tensor<4x!tf.variant>' vs 'tensor<8x!tf.variant>'}} tf_executor.fetch %value : tensor<8x!tf.variant> } return %result : tensor<8x!tf.variant> @@ -522,6 +522,18 @@ func @invalid_merge(%arg0: tensor<*x!tf.variant>, %arg1: tensor<4x!tf.variant>) // ----- +// Check that if result is a ref type, all operands need to be ref too. +func @inavlid_merge(%arg0: tensor<4x!tf.f32ref>, %arg1: tensor<4xf32>) -> tensor<4x!tf.f32ref> { + %result = tf_executor.graph { + %value, %idx, %ctlMerge = "tf_executor.Merge"(%arg0, %arg1) : (tensor<4x!tf.f32ref>, tensor<4xf32>) -> (tensor<4x!tf.f32ref>, tensor, !tf_executor.control) + // expected-error@-1 {{'tf_executor.Merge' op expects same operand and output element type but got 'tensor<4xf32>' vs 'tensor<4x!tf.f32ref>'}} + tf_executor.fetch %value : tensor<4x!tf.f32ref> + } + return %result : tensor<4x!tf.f32ref> +} + +// ----- + // Check that merge data inputs can't appear after control input. func @invalid_merge(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf32> { %result = tf_executor.graph { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index dc2f60b6441..e91e772d47f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -11,9 +11,9 @@ module { %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) - // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster0" - // CHECK-SAME: module + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: NumDynamicShapes = 1 + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-NOT: func = @tpu0_func @@ -68,9 +68,8 @@ module { %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) - // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster0" - // CHECK-SAME: module + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-SAME: func @nested_func @@ -112,9 +111,8 @@ module { %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) - // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster0" - // CHECK-SAME: module + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-SAME: func @referenced_func @@ 
-155,9 +153,8 @@ module { %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) - // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster0" - // CHECK-SAME: module + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-SAME: @referenced_func1 @@ -206,9 +203,8 @@ module { %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) - // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster0" - // CHECK-SAME: module + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-COUNT-2: call @referenced_func @@ -251,9 +247,8 @@ module { %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func0} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) - // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster0" - // CHECK-SAME: module + // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-NOT: func = @tpu0_func0 @@ -263,9 +258,8 @@ module { %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "tpu0", func = @tpu0_func1} : (tensor) -> tensor // CHECK: %[[EXECUTE0_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[EXECUTE0_OUTPUT]]) - // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[EXECUTE0_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster1" - // CHECK-SAME: module + // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[EXECUTE0_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.D // CHECK-NOT: func = @tpu0_func1 @@ -303,9 +297,8 @@ module { %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) - // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster0" - // CHECK-SAME: module + // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-NOT: func = @tpu0_func @@ -315,9 +308,8 @@ module { %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor // CHECK: %[[EXECUTE0_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[EXECUTE0_OUTPUT]]) - // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[EXECUTE0_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster1" - // CHECK-SAME: module + // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[EXECUTE0_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-NOT: func = @tpu0_func @@ -351,9 +343,8 @@ module { %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "tpu0", func 
= @tpu0_func} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) - // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf.MLIRCompileToTPU"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: _tpu_replicate = "cluster0" - // CHECK-SAME: module + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) + // CHECK-SAME: mlir_module // CHECK-SAME: func @main // CHECK-SAME: tf.B // CHECK-SAME: func @referenced_func2 @@ -404,3 +395,44 @@ module { } +// ----- + + +// Tests that TPUCompilationResult operations are properly rewritten + +// CHECK-LABEL: func @tpu_compilation_result +func @tpu_compilation_result(%arg0: tensor) -> (tensor, tensor, tensor) { + + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir" + // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute" + %1 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "tpu0", func = @tpu0_func} : (tensor) -> tensor + + %compile_result = "tf.TPUCompilationResult"() {_tpu_replicate = "cluster0"} : () -> tensor + %compile_result2 = "tf.TPUCompilationResult"() {_tpu_replicate = "cluster0"} : () -> tensor + + // CHECK: return %[[EXECUTE_OUTPUT]], %[[COMPILE_OUTPUT]]#0, %[[COMPILE_OUTPUT]]#0 + return %1, %compile_result, %compile_result2 : tensor, tensor, tensor +} + +func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + + +// ----- + +// Tests that TPUReplicatedInput and TPUReplicatedOutput operations are properly rewritten + +func @main(%arg0 : tensor<0xf32>, %arg1 : tensor<0xf32>) -> tensor<0xf32> { + // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%arg0, %arg1 + %0 = "tf.TPUReplicatedInput"(%arg0) {N = 1 : i64} : (tensor<0xf32>) -> tensor<0xf32> + %1 = "tf.TPUReplicatedInput"(%arg1) {N = 1 : i64} : (tensor<0xf32>) -> tensor<0xf32> + %2 = "tf_device.launch_func"(%0, %1) {device = "", _tpu_replicate = "cluster", func = @_func} : (tensor<0xf32>, tensor<0xf32>) -> tensor<0xf32> + %3 = "tf.TPUReplicatedOutput"(%2) {num_replicas = 1 : i64} : (tensor<0xf32>) -> tensor<0xf32> + return %3 : tensor<0xf32> +} +func @_func(%arg0: tensor<0xf32>, %arg1: tensor<0xf32>) -> tensor<0xf32> { + %0 = "tf.Const"() {value = dense<3.000000e+00> : tensor<0xf32>} : () -> tensor<0xf32> + return %0 : tensor<0xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index a186837bb79..4655f0f8e41 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -23,10 +23,11 @@ limitations under the License. 
namespace mlir { namespace TFTPU { -void createTPUBridge(PassManager &bridge) { +void createTPUBridge(OpPassManager &bridge) { bridge.addPass(tf_executor::CreateTFExecutorIslandCoarseningPass()); bridge.addPass(createCanonicalizerPass()); bridge.addPass(CreateTPUClusterFormationPass()); + bridge.addPass(tf_executor::CreateTFExecutorConstantSinkingPass()); bridge.addPass(TFDevice::CreateClusterOutliningPass()); bridge.addPass(CreateTPURewritePass()); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td index 0653c1d109e..ebdc11b8fbf 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td @@ -93,11 +93,13 @@ def LogOfSoftmax : Pat<(TF_LogOp (TF_SoftmaxOp $arg)), (TF_LogSoftmaxOp $arg)>; def LogicalNotNested : Pat<(TF_LogicalNotOp (TF_LogicalNotOp $arg)), (replaceWithValue $arg)>; -def LogicalNotOfEqual : Pat<(TF_LogicalNotOp (TF_EqualOp $arg0, $arg1)), - (TF_NotEqualOp $arg0, $arg1)>; +def LogicalNotOfEqual : Pat< + (TF_LogicalNotOp (TF_EqualOp $arg0, $arg1, $shape_error)), + (TF_NotEqualOp $arg0, $arg1, $shape_error)>; -def LogicalNotOfNotEqual : Pat<(TF_LogicalNotOp (TF_NotEqualOp $arg0, $arg1)), - (TF_EqualOp $arg0, $arg1)>; +def LogicalNotOfNotEqual : Pat< + (TF_LogicalNotOp (TF_NotEqualOp $arg0, $arg1, $shape_error)), + (TF_EqualOp $arg0, $arg1, $shape_error)>; def LogicalNotOfGreater : Pat<(TF_LogicalNotOp (TF_GreaterOp $arg0, $arg1)), (TF_LessEqualOp $arg0, $arg1)>; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc new file mode 100644 index 00000000000..c65544ed5e1 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -0,0 +1,97 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" + +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { +namespace { + +// Infers ExpandDims op output type for the given input type `ty` and dimension +// to expand at the given `axis`. +Type InferExpandDimsType(Type ty, int64_t axis, Builder *builder) { + auto ranked_ty = ty.dyn_cast(); + + // Unranked type. + if (!ranked_ty) return ty; + + auto shape = llvm::to_vector<4>(ranked_ty.getShape()); + if (axis < 0) axis += ranked_ty.getRank() + 1; + + shape.insert(shape.begin() + axis, 1); + return builder->getTensorType(shape, ranked_ty.getElementType()); +} + +// Lowers Pack op to ConcatV2 op after changing shape of the inputs with +// ExpandDims op. 
+// +// Sample result with 2 inputs to pack: +// +// %axis = "tf.Const"() {value = dense<1> : tensor} +// %inp0 = "tf.ExpandDims"(%operand0, %axis): tensor<2xf32> -> tensor<2x1xf32> +// %inp1 = "tf.ExpandDims"(%operand1, %axis): tensor<2xf32> -> tensor<2x1xf32> +// %result = "tf.ConcatV2"(%operand0, %operand1, %axis) { N = 2 : i64 }: +// +class LowerPackOp : public OpRewritePattern { + public: + explicit LowerPackOp(MLIRContext *context) + : OpRewritePattern(context) {} + + PatternMatchResult matchAndRewrite(TF::PackOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + auto axis_value = rewriter.create( + loc, DenseElementsAttr::get( + rewriter.getTensorType({}, rewriter.getIntegerType(64)), + op.axis())); + int64_t axis = op.axis().getLimitedValue(); + + Type prev_input_ty, inferred_ty; + SmallVector expanded_inputs; + expanded_inputs.reserve(op.N().getLimitedValue()); + for (Value *input : op.values()) { + // If input type is different than the previous input type, infer the + // output type. Otherwise, use the already inferred output type from the + // previous iteration. + Type input_ty = input->getType(); + if (input_ty != prev_input_ty) { + inferred_ty = InferExpandDimsType(input_ty, axis, &rewriter); + prev_input_ty = input_ty; + } + expanded_inputs.push_back(rewriter.create( + loc, inferred_ty, input, axis_value)); + } + + rewriter.replaceOpWithNewOp( + op, op.getType(), expanded_inputs, axis_value, + op.getAttrOfType("N")); + return matchSuccess(); + } +}; + +} // namespace + +void PopulateLoweringTFPatterns(MLIRContext *context, + OwningRewritePatternList *patterns) { + patterns->insert(context); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h new file mode 100644 index 00000000000..4b85ac3b46a --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h @@ -0,0 +1,33 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_TF_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_TF_H_ + +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir + +namespace mlir { +namespace TF { + +// Populates TensorFlow lowering patterns to lower some of the TensorFlow +// operations that can be represented using other TensorFlow operations. 
+void PopulateLoweringTFPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_LOWER_TF_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf_pass.cc new file mode 100644 index 00000000000..309d0147bc0 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf_pass.cc @@ -0,0 +1,42 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" + +namespace mlir { +namespace TF { +namespace { + +// Lowers some of the TensorFlow operations that can be represented using other +// TensorFlow operations. +struct LowerTF : public FunctionPass { + void runOnFunction() override { + // Add lowering patterns to the list. + OwningRewritePatternList patterns; + mlir::TF::PopulateLoweringTFPatterns(&getContext(), &patterns); + + applyPatternsGreedily(getFunction(), patterns); + } +}; + +} // namespace +} // namespace TF +} // namespace mlir + +static mlir::PassRegistration pass( + "test-tf-lower-tf", + "Lowers some of the TensorFlow ops to other TensorFlow ops"); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/materialize_mlir_passthrough_op.cc b/tensorflow/compiler/mlir/tensorflow/transforms/materialize_mlir_passthrough_op.cc new file mode 100644 index 00000000000..0f74fda2336 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/materialize_mlir_passthrough_op.cc @@ -0,0 +1,109 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Block.h" // TF:local_config_mlir +#include "mlir/IR/Diagnostics.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/IR/OpDefinition.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "mlir/Parser.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +#define DEBUG_TYPE "tf-materialize-passthrough-op" + +namespace mlir { +namespace { + +class MaterializePassthroughOpPass + : public FunctionPass { + public: + void runOnFunction() override; +}; + +void MaterializePassthroughOpPass::runOnFunction() { + getFunction().walk([](Operation *op) { + auto passthrough_op = dyn_cast(op); + if (!passthrough_op) return; + std::string module_string = passthrough_op.mlir_module(); + // Parse the module. + auto nested_module = parseSourceString(module_string, op->getContext()); + if (!nested_module) { + op->emitError() << "could not parse attached MLIR module"; + return; + } + FuncOp main = dyn_cast(nested_module->lookupSymbol("main")); + if (!main) { + op->emitError() << "MLIR Opaque Op expects a main() entry point\n"; + return; + } + if (main.getNumArguments() != op->getNumOperands()) { + op->emitError() << "mismatch between MLIR Opaque Op number of operands (" + << op->getNumOperands() + << ") and main() entry point in the module (" + << main.getNumArguments() << " args)\n"; + return; + } + if (main.getType().getNumResults() != op->getNumResults()) { + op->emitError() << "mismatch between MLIR Opaque Op number of results (" + << op->getNumResults() + << ") and main() entry point in the module (" + << main.getType().getNumResults() << " results)\n"; + return; + } + Region &body = main.getBody(); + if (body.getBlocks().size() != 1) { + op->emitError() << "MLIR Opaque Op expects a main() entry point with a " + "single block\n"; + return; + } + Block &block = body.front(); + for (const auto &arg_mapping : + llvm::zip(block.getArguments(), op->getOperands())) { + std::get<0>(arg_mapping)->replaceAllUsesWith(std::get<1>(arg_mapping)); + } + op->getBlock()->getOperations().splice(op->getIterator(), + block.getOperations(), block.begin(), + std::prev(block.end())); + Operation &return_op = block.front(); + for (auto ret_mapping : + llvm::zip(op->getResults(), return_op.getOperands())) { + std::get<0>(ret_mapping)->replaceAllUsesWith(std::get<1>(ret_mapping)); + } + op->erase(); + }); +} + +} // namespace + +namespace TF { +std::unique_ptr CreateMaterializePassthroughOpPass() { + return std::make_unique(); +} +} // namespace TF + +static PassRegistration pass( + "tf-materialize-passthrough-op", + "Materialize the MlirPassthroughOp by replacing it with the MLIR module " + "attached as an attribute"); + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 9501f49475f..15ddebdffe8 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -26,6 +26,10 @@ namespace TF { // dialect to MLIR Control Flow Graph (CFG) form. 
std::unique_ptr CreateTFFunctionalControlFlowToCFG(); +// Materialize the MlirPassthroughOp by replacing it with the MLIR module +// attached as an attribute. +std::unique_ptr CreateMaterializePassthroughOpPass(); + // Optimizes Tensorflow graph. std::unique_ptr CreateTFOptimizePass(); @@ -53,6 +57,11 @@ std::unique_ptr CreateTFExecutorGraphPruningPass(); // Prune a tf_executor.graph operation from dead nodes. void prune_graph(GraphOp graph); +// Sink `tf.Const` operations in the LaunchOp region using them. This is +// performed in order to limit the number of values implicitly captured in this +// region before outlining. +std::unique_ptr CreateTFExecutorConstantSinkingPass(); + } // namespace tf_executor namespace TFDevice { @@ -75,7 +84,7 @@ std::unique_ptr CreateTPURewritePass(); // Populates the supplied passmanager with the passes required to run the // bridge. NOLINTNEXTLINE - MLIR contract is pass by mutable reference. -void createTPUBridge(PassManager& bridge); +void createTPUBridge(OpPassManager& bridge); } // namespace TFTPU diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sink_constant.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sink_constant.cc new file mode 100644 index 00000000000..86344e5fa3e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/sink_constant.cc @@ -0,0 +1,98 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassManager.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "mlir/Transforms/Passes.h" // TF:local_config_mlir +#include "mlir/Transforms/RegionUtils.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" + +#define DEBUG_TYPE "tf-executor-sink-constant" + +namespace mlir { +namespace tf_executor { + +namespace { +using ::mlir::TF::ConstOp; + +class ExecutorConstantSinking + : public mlir::FunctionPass { + void runOnFunction() override { + getFunction().walk([](tf_device::LaunchOp launch) { + LLVM_DEBUG(llvm::dbgs() << "Visit " << *launch.getOperation() << "\n"); + // For each launch op, we find the values used that come from a constant + // defined above and sink these constants in the region body. + // The sunk_constant map keeps a mapping from a ConstOp defined above to + // a sunk clone of it. This allows for reusing a sunk constant with + // multiple uses in the region. 
+ llvm::DenseMap sunk_constant; + Region &body = launch.body(); + visitUsedValuesDefinedAbove(body, [&](OpOperand *use) { + Value *constant = use->get(); + auto const_op = + dyn_cast_or_null(constant->getDefiningOp()); + if (!const_op) return; + + // We found a constant, try to insert it in the map and re-use its + // cloned value if any. + auto map_entry = sunk_constant.try_emplace(constant, nullptr); + if (!map_entry.second) { + // This constant has already been cloned into the region, reuse it. + use->set(map_entry.first->getSecond().getResult()); + LLVM_DEBUG(llvm::dbgs() << "Re-use sunk constant " << *use->get() + << "\n in " << *use->get() << "\n"); + if (constant->use_empty()) const_op.erase(); + return; + } + if (constant->hasOneUse()) { + LLVM_DEBUG(llvm::dbgs() << "Moved constant " << *constant << "\n"); + const_op.getOperation()->moveBefore(&body.begin()->front()); + return; + } + map_entry.first->getSecond() = const_op.clone(); + body.begin()->getOperations().insert(body.begin()->begin(), + map_entry.first->getSecond()); + use->set(map_entry.first->getSecond().getResult()); + LLVM_DEBUG(llvm::dbgs() << "Sunk cloned constant " << *use->get() + << "\n in " << *use->get() << "\n"); + }); + }); + } +}; + +static mlir::PassRegistration pass( + "tf-device-constant-sinking", + "Sink constants implicitly captured in a tf_device.launch region. This " + "reduces the number of arguments when outlining later."); + +} // anonymous namespace + +std::unique_ptr CreateTFExecutorConstantSinkingPass() { + return std::make_unique(); +} + +} // namespace tf_executor +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 05de214f992..91fc073e1f3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" #include "mlir/IR/Attributes.h" // TF:local_config_mlir @@ -22,6 +23,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -137,7 +139,7 @@ std::string EncapsulateFuncAndSerialize(FuncOp entry_func) { Operation* BuildCompileOp(tf_device::LaunchFuncOp launch_func, OpBuilder* builder) { // TODO(b/139377366): Use tf_tpu.compile build method when it is defined. - OperationState compile_op_state(launch_func.getLoc(), "tf.MLIRCompileToTPU"); + OperationState compile_op_state(launch_func.getLoc(), "tf._TPUCompileMlir"); // Build a shape op for each input to launch_func. 
// TODO(b/139377366): When shape inference is ready, we can use compile time @@ -153,6 +155,9 @@ Operation* BuildCompileOp(tf_device::LaunchFuncOp launch_func, compile_op_operands.emplace_back(shape_op.getResult()); } compile_op_state.addOperands(compile_op_operands); + compile_op_state.addAttribute( + "NumDynamicShapes", + builder->getI64IntegerAttr(compile_op_operands.size())); SymbolRefAttr func_attr = launch_func.getAttrOfType("func"); if (!func_attr) { @@ -163,13 +168,8 @@ Operation* BuildCompileOp(tf_device::LaunchFuncOp launch_func, func_attr.getValue()); std::string txt_module = EncapsulateFuncAndSerialize(func); - compile_op_state.addAttribute("module", builder->getStringAttr(txt_module)); - - // Copy all launch_func attributes other than `func`. - for (auto attr : launch_func.getAttrs()) { - if (attr.first == "func") continue; - compile_op_state.attributes.emplace_back(attr); - } + compile_op_state.addAttribute("mlir_module", + builder->getStringAttr(txt_module)); // Result #0 is a string indicating whether compilation is successful or not. compile_op_state.addTypes( @@ -239,8 +239,21 @@ void BuildTPUCompileSucceededAssertOp(Operation* compile_op, // Operations that jit-compiles and executes function in `tf_device.launch_func` // on TPU. void Rewrite(tf_device::LaunchFuncOp launch_func, OpBuilder* builder) { + // Skip non-tpu device launch_func. + auto replicate_attr = launch_func.getAttrOfType("_tpu_replicate"); + if (!replicate_attr) return; + builder->setInsertionPoint(launch_func); Operation* compile_op = BuildCompileOp(launch_func, builder); + + // After rewrite, find if there is a TPUCompilationResultOp in the block with + // the same _tpu_replicate attribute and replace it with the result of the + // compile op. This op is used as a placeholder to hook during graph creation + // the other ops that are intended to consume the compile result. + Block* block = launch_func.getOperation()->getBlock(); + for (auto compile_result_op : block->getOps()) + compile_result_op.output()->replaceAllUsesWith(compile_op->getResult(0)); + BuildTPUCompileSucceededAssertOp(compile_op, builder); // TODO(ycao): Right now we only support single-core case. The right thing to // do is to read from launch_func attributes to determine how many execute @@ -253,11 +266,20 @@ void Rewrite(tf_device::LaunchFuncOp launch_func, OpBuilder* builder) { void TPURewritePass::runOnModule() { OpBuilder builder(&getContext()); getModule().walk([&](tf_device::LaunchFuncOp op) { - // Skip non-tpu device launch_func. - if (!op.getAttrOfType("_tpu_replicate")) return; Rewrite(op, &builder); }); + // Eliminate TPUReplicatedInput and TPUReplicatedOutput now that the rewrite + // is complete. + getModule().walk([&](Operation* op) { + auto op_name = op->getName().getStringRef(); + if (op_name != "tf.TPUReplicatedInput" && + op_name != "tf.TPUReplicatedOutput") + return; + op->getResult(0)->replaceAllUsesWith(op->getOperand(0)); + op->erase(); + }); + // TODO(b/139377366): Remove functions that are no longer needed. 
} diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 9c1ffd4466e..e9aaf56462c 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -67,8 +67,11 @@ namespace tensorflow { using llvm::cast; using llvm::dyn_cast; using llvm::isa; +using mlir::BlockArgument; using mlir::Dialect; using mlir::Operation; +using mlir::OperationState; +using mlir::Value; using stream_executor::port::StatusOr; namespace { @@ -125,35 +128,34 @@ class Exporter { explicit Exporter(Graph* graph, const Dialect* tf_dialect) : graph_(graph), tf_dialect_(tf_dialect) {} - Status AddArgumentNode(mlir::BlockArgument* arg, unsigned index); - Status AddInstructionNode(mlir::Operation* inst); - Status AddNextIterationNode(mlir::Operation* inst); - Status AddEdge(mlir::Operation* inst); + Status AddArgumentNode(BlockArgument* arg, unsigned index); + Status AddInstructionNode(Operation* inst); + Status AddNextIterationNode(Operation* inst); + Status AddEdge(Operation* inst); - StatusOr> GetArgumentNode(mlir::BlockArgument* arg, + StatusOr> GetArgumentNode(BlockArgument* arg, unsigned index); - StatusOr> GetReturnNode(mlir::Operation* inst, + StatusOr> GetReturnNode(Operation* inst, unsigned index); // Adds one edge between src_node and dst_node. If it is not a control edge, // an index is used to find out the right operand of the dst_node. - Status AddEdgeBetweenNodes(mlir::Value* src, Node* dst_node, - unsigned dst_index); + Status AddEdgeBetweenNodes(Value* src, Node* dst_node, unsigned dst_index); // Returns a unique name for `op`. - std::string UniqueName(mlir::Operation* op); + std::string UniqueName(Operation* op); // Returns a unique name starting with a given prefix. std::string UniqueName(llvm::StringRef prefix); Graph* graph_; - absl::flat_hash_map op_to_name_; + absl::flat_hash_map op_to_name_; absl::flat_hash_map name_to_count_; - absl::flat_hash_map nodes_; - absl::flat_hash_map args_; + absl::flat_hash_map nodes_; + absl::flat_hash_map args_; // One single return operation can return multiple results, and each of them // will be converted to one node in the graph. typedef absl::InlinedVector NodeVector; - absl::flat_hash_map returns_; + absl::flat_hash_map returns_; // Each NextIteration node in the original graph is converted to a pair of // source and sink operations in the MLIR, and we use the following two maps @@ -163,8 +165,8 @@ class Exporter { // are inserted to the name_to_inst_ first, and the other "sink" operation // can be paired by checking this map and both are inserted to the // source_to_sink_ map. 
- absl::flat_hash_map name_to_inst_; - absl::flat_hash_map source_to_sink_; + absl::flat_hash_map name_to_inst_; + absl::flat_hash_map source_to_sink_; const mlir::Dialect* tf_dialect_; }; @@ -183,15 +185,15 @@ std::string Exporter::UniqueName(llvm::StringRef prefix) { return name; } -std::string Exporter::UniqueName(mlir::Operation* op) { +std::string Exporter::UniqueName(Operation* op) { auto& name = op_to_name_[op]; if (!name.empty()) return name; name = UniqueName(GetName(op)); return name; } -StatusOr> Exporter::GetArgumentNode( - mlir::BlockArgument* arg, unsigned index) { +StatusOr> Exporter::GetArgumentNode(BlockArgument* arg, + unsigned index) { auto node_def = absl::make_unique(); node_def->set_name(UniqueName( arg->getParentRegion()->getParentOfType().getName().str())); @@ -208,8 +210,8 @@ StatusOr> Exporter::GetArgumentNode( return node_def; } -StatusOr> Exporter::GetReturnNode( - mlir::Operation* inst, unsigned index) { +StatusOr> Exporter::GetReturnNode(Operation* inst, + unsigned index) { auto node_def = absl::make_unique(); auto* inst_op = inst->getOperand(index); node_def->set_name( @@ -227,7 +229,7 @@ StatusOr> Exporter::GetReturnNode( return node_def; } -Status Exporter::AddEdgeBetweenNodes(mlir::Value* src, Node* dst_node, +Status Exporter::AddEdgeBetweenNodes(Value* src, Node* dst_node, unsigned dst_index) { if (auto* input_result = dyn_cast(src)) { auto* input_inst = input_result->getOwner(); @@ -236,25 +238,28 @@ Status Exporter::AddEdgeBetweenNodes(mlir::Value* src, Node* dst_node, if (it != source_to_sink_.end()) { input_inst = source_to_sink_[input_inst]; } - TF_RET_CHECK(nodes_.find(input_inst) != nodes_.end()) + auto node_it = nodes_.find(input_inst); + TF_RET_CHECK(node_it != nodes_.end()) << "Use of OpResult encountered before def!"; if (input_result->getType().isa()) { - graph_->AddControlEdge(nodes_[input_inst], dst_node); + graph_->AddControlEdge(node_it->second, dst_node); } else { - graph_->AddEdge(nodes_[input_inst], input_result->getResultNumber(), + graph_->AddEdge(node_it->second, input_result->getResultNumber(), dst_node, dst_index); } - } else if (auto* input_arg = dyn_cast(src)) { - TF_RET_CHECK(args_.find(input_arg) != args_.end()) - << "Use of BlockArgument encounted before def!"; - auto* input_node = args_[input_arg]; - // For argument, there is only one result output, so the index is always 0. - graph_->AddEdge(input_node, 0, dst_node, dst_index); + return Status::OK(); } + + auto* input_arg = cast(src); + auto input_node_it = args_.find(input_arg); + TF_RET_CHECK(input_node_it != args_.end()) + << "Use of BlockArgument encounted before def!"; + // For argument, there is only one result output, so the index is always 0. + graph_->AddEdge(input_node_it->second, 0, dst_node, dst_index); return Status::OK(); } -Status Exporter::AddEdge(mlir::Operation* inst) { +Status Exporter::AddEdge(Operation* inst) { auto* dst_node = nodes_[inst]; bool is_return_op = isa(inst); for (int index = 0, e = inst->getNumOperands(); index < e; index++) { @@ -273,79 +278,86 @@ Status Exporter::AddEdge(mlir::Operation* inst) { return Status::OK(); } -Status Exporter::AddInstructionNode(mlir::Operation* inst) { +Status Exporter::AddInstructionNode(Operation* inst) { Status status; - if (!inst->isKnownTerminator()) { - std::unique_ptr node_def; - auto name = UniqueName(inst); - // Convert registered TF ops to NodeDef. Only registered ops are handled to - // ensure that PopulateDerivedAttrs adds the correct attributes. 
- TF_ASSIGN_OR_RETURN(node_def, - ConvertTFDialectOpToNodeDef( - inst, name, /*ignore_unregistered_attrs=*/false)); - Node* node = graph_->AddNode(*node_def, &status); - TF_RETURN_IF_ERROR(status); - nodes_[inst] = node; - } else if (isa(inst)) { - for (int index = 0, end = inst->getNumOperands(); index != end; index++) { + // If the op is a ReturnOp then create a return node per operand. + if (isa(inst)) { + auto& return_nodes = returns_[inst]; + for (int index : llvm::seq(0, inst->getNumOperands())) { TF_ASSIGN_OR_RETURN(auto node_def, GetReturnNode(inst, index)); Node* node = graph_->AddNode(*node_def, &status); TF_RETURN_IF_ERROR(status); - if (returns_.find(inst) == returns_.end()) { - returns_[inst] = NodeVector(); - } - returns_[inst].push_back(node); + return_nodes.push_back(node); } - } else { - return errors::InvalidArgument("Operation input was not an Value!"); + return Status::OK(); } + + if (inst->isKnownTerminator()) + return errors::InvalidArgument("std.return is only allowed terminator"); + + std::unique_ptr node_def; + auto name = UniqueName(inst); + // Convert registered TF ops to NodeDef. Only registered ops are handled to + // ensure that PopulateDerivedAttrs adds the correct attributes. + TF_ASSIGN_OR_RETURN(node_def, + ConvertTFDialectOpToNodeDef( + inst, name, /*ignore_unregistered_attrs=*/false)); + + Node* node = graph_->AddNode(*node_def, &status); + TF_RETURN_IF_ERROR(status); + nodes_[inst] = node; return Status::OK(); } -Status Exporter::AddArgumentNode(mlir::BlockArgument* arg, unsigned index) { - // If it is an argument from the "main" function, it has only one user, which - // is an input node. We recover the original input node and skip adding the - // argument node. The new input node will be handled as normal in the - // following steps. - if (arg->getParentRegion()->getParentOfType().getName() == - "main") { - if (!arg->hasOneUse()) { - return errors::FailedPrecondition( - "Arg in 'main' should only have one user."); - } - auto* input = *arg->user_begin(); - auto input_name = input->getName().getStringRef(); - input_name.consume_back(".input"); - mlir::OpBuilder builder(arg->getOwner()); - auto loc = mlir::NameLoc::get(builder.getIdentifier(UniqueName(input)), - builder.getContext()); - mlir::OperationState state(loc, input_name.str()); - state.attributes.append(input->getAttrs().begin(), input->getAttrs().end()); - for (auto* op : input->getOperands()) { - // Skip the argument in the new operation. - if (llvm::isa(op)) continue; - state.operands.push_back(op); - } - for (auto* r : input->getResults()) state.types.push_back(r->getType()); - auto* inst = builder.createOperation(state); - // If it is one of the specified input names, then the new - // instruction should have the same name. 
- op_to_name_[inst].assign(op_to_name_[input]); - for (int index = 0, e = input->getNumResults(); index != e; ++index) { - input->getResult(index)->replaceAllUsesWith(inst->getResult(index)); - } - input->dropAllReferences(); - input->erase(); - return Status::OK(); - } else { +bool IsEntryFunctionArg(BlockArgument* arg) { + return arg->getParentRegion()->getParentOfType().getName() == + "main"; +} + +Status Exporter::AddArgumentNode(BlockArgument* arg, unsigned index) { + if (!IsEntryFunctionArg(arg)) { TF_ASSIGN_OR_RETURN(auto node_def, GetArgumentNode(arg, index)); Status status; Node* node = graph_->AddNode(*node_def, &status); TF_RETURN_IF_ERROR(status); args_[arg] = node; - return Status::OK(); + return status; } + + // If it is an argument from the "main" function, it has only one user, which + // is an input node. We recover the original input node and skip adding the + // argument node. The new input node will be handled as normal in the + // following steps. + if (!arg->hasOneUse()) { + return errors::FailedPrecondition( + "Arg in 'main' should only have one user."); + } + auto* input = *arg->user_begin(); + auto input_name = input->getName().getStringRef(); + input_name.consume_back(".input"); + mlir::OpBuilder builder(arg->getOwner()); + auto loc = mlir::NameLoc::get(builder.getIdentifier(UniqueName(input)), + builder.getContext()); + OperationState state(loc, input_name.str()); + state.attributes.append(input->getAttrs().begin(), input->getAttrs().end()); + for (auto* op : input->getOperands()) { + // Skip the argument in the new operation. + if (llvm::isa(op)) continue; + state.operands.push_back(op); + } + state.types.append(input->getResultTypes().begin(), + input->getResultTypes().end()); + auto* inst = builder.createOperation(state); + // If it is one of the specified input names, then the new + // instruction should have the same name. + op_to_name_[inst].assign(op_to_name_[input]); + for (int index : llvm::seq(0, input->getNumResults())) { + input->getResult(index)->replaceAllUsesWith(inst->getResult(index)); + } + input->dropAllReferences(); + input->erase(); + return Status::OK(); } // Handles an NextIteration node specially: @@ -353,7 +365,7 @@ Status Exporter::AddArgumentNode(mlir::BlockArgument* arg, unsigned index) { // map by using its name attribute; // - NextIteration "sink" is paired with the "source" with the name attribute. // It is added to the graph like the other operations. 
-Status Exporter::AddNextIterationNode(mlir::Operation* inst) { +Status Exporter::AddNextIterationNode(Operation* inst) { auto name = GetName(inst); if (inst->getName().getStringRef().endswith(".source")) { name_to_inst_[name] = inst; @@ -363,10 +375,9 @@ Status Exporter::AddNextIterationNode(mlir::Operation* inst) { return AddInstructionNode(inst); } -StatusOr> Exporter::Convert(const ExporterConfigs& confs, - const Dialect* tf_dialect, - mlir::FuncOp function, - FunctionDefLibrary* flib) { +StatusOr> Exporter::Convert( + const ExporterConfigs& configs, const Dialect* tf_dialect, + mlir::FuncOp function, FunctionDefLibrary* flib) { if (function.getBlocks().size() != 1) { return errors::FailedPrecondition( "Input FuncOp must have only one basic block!"); @@ -420,10 +431,10 @@ StatusOr> Exporter::Convert(const ExporterConfigs& confs, TF_RET_CHECK(output_names.size() == term->getNumOperands()) << "output names (" << output_names.size() << ") != terminator operands (" << term->getNumOperands() << ")"; - int i = 0; - for (auto it : term->getOperands()) { - exporter.name_to_count_[output_names[i].str()] = 1; - exporter.op_to_name_[it->getDefiningOp()] = output_names[i++]; + for (auto it : llvm::enumerate(term->getOperands())) { + exporter.name_to_count_[output_names[it.index()].str()] = 1; + exporter.op_to_name_[it.value()->getDefiningOp()] = + output_names[it.index()]; } } if (!input_names.empty()) { @@ -435,8 +446,9 @@ StatusOr> Exporter::Convert(const ExporterConfigs& confs, } // Adds nodes for basic block (function) arguments. - for (int index = 0, e = block.getNumArguments(); index != e; index++) { - auto* arg = block.getArgument(index); + for (auto it : llvm::enumerate(block.getArguments())) { + int index = it.index(); + auto* arg = it.value(); mlir::Type type = arg->getType(); if (!type.isa()) { return errors::InvalidArgument( @@ -447,7 +459,7 @@ StatusOr> Exporter::Convert(const ExporterConfigs& confs, TF_RETURN_IF_ERROR(exporter.AddArgumentNode(arg, index)); } // Adds nodes for operations. - for (mlir::Operation& inst : block) { + for (Operation& inst : block) { auto op_name = GetTensorFlowOpName(inst.getName().getStringRef()); if (op_name.ok()) { // If it is TF Control dialect specific op, look up custom operation @@ -459,13 +471,12 @@ StatusOr> Exporter::Convert(const ExporterConfigs& confs, function.getParentOfType().lookupSymbol( op_name.ValueOrDie()); if (func != nullptr) { - TF_RETURN_IF_ERROR(ConvertLibFunction(confs, tf_dialect, func, flib)); + TF_RETURN_IF_ERROR(ConvertLibFunction(configs, tf_dialect, func, flib)); TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(*flib)); } } - for (auto* result : inst.getResults()) { - mlir::Type type = result->getType(); + for (auto type : inst.getResultTypes()) { if (!type.isa() && !type.isa()) { return errors::InvalidArgument( @@ -481,7 +492,7 @@ StatusOr> Exporter::Convert(const ExporterConfigs& confs, } } // Adds edges between the argument, operation and return nodes. 
- for (mlir::Operation& inst : block) { + for (Operation& inst : block) { TF_RETURN_IF_ERROR(exporter.AddEdge(&inst)); } // Fixes the edges between the inserted nodes and special "_SOURCE" and @@ -584,7 +595,7 @@ Status Exporter::Convert(mlir::ModuleOp module, const ExporterConfigs& configs, } } // namespace -Status ConvertMlirToGraph(mlir::ModuleOp module, const ExporterConfigs& confs, +Status ConvertMlirToGraph(mlir::ModuleOp module, const ExporterConfigs& configs, std::unique_ptr* graph, FunctionLibraryDefinition* flib_def) { mlir::PassManager pass_manager(module.getContext()); @@ -593,24 +604,24 @@ Status ConvertMlirToGraph(mlir::ModuleOp module, const ExporterConfigs& confs, return errors::FailedPrecondition( "Failed to convert TFExecutor Dialect to Control Dialect."); } - return Exporter::Convert(module, confs, graph, flib_def); + return Exporter::Convert(module, configs, graph, flib_def); } StatusOr> ConvertMlirToGraphdef( - mlir::ModuleOp module, const ExporterConfigs& confs) { + mlir::ModuleOp module, const ExporterConfigs& configs) { FunctionLibraryDefinition flib_def(OpRegistry::Global(), FunctionDefLibrary()); auto graph = absl::make_unique(flib_def); - TF_RETURN_IF_ERROR(ConvertMlirToGraph(module, confs, &graph, &flib_def)); + TF_RETURN_IF_ERROR(ConvertMlirToGraph(module, configs, &graph, &flib_def)); auto graphdef = absl::make_unique(); graph->ToGraphDef(graphdef.get()); - if (!confs.export_library) graphdef->clear_library(); - if (!confs.export_shapes) { + if (!configs.export_library) graphdef->clear_library(); + if (!configs.export_shapes) { for (auto& node_def : *graphdef->mutable_node()) { node_def.mutable_attr()->erase("shape"); } } - if (!confs.export_debug_info) { + if (!configs.export_debug_info) { for (auto& node_def : *graphdef->mutable_node()) { node_def.clear_experimental_debug_info(); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 34cdc609164..c06bd3ec5c3 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -26,6 +26,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "absl/strings/strip.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -476,7 +477,8 @@ Status ImporterBase::AddNodesToShapeRefiner() { auto it = specs_.inputs.find(node->name()); if (it != specs_.inputs.end()) { auto node_name = node->op_def().name(); - if (node_name != "Placeholder" && node_name != "LegacyFedInput") { + if (node_name != "Placeholder" && node_name != "LegacyFedInput" && + node_name != "_Arg") { // We do not handle the case where the input node has multple outputs if (node->num_outputs() > 1) { return errors::FailedPrecondition(absl::StrCat( @@ -496,6 +498,35 @@ Status ImporterBase::AddNodesToShapeRefiner() { TF_RETURN_WITH_CONTEXT_IF_ERROR(shape_refiner_->AddNode(node), GetLocationStr(*node)); + // We currently have no other way to get shapes from ReadVariableOp's. + // Some graphs seem to have _output_shapes attributes on them, so use that + // if possible. + // TODO(silvasean): Ideally, we would do this in a separate shape inference + // pass to avoid adding complexity to the importer. But right now, we don't + // have an MLIR-native shape inference pass, so we need to do this while we + // still have the Graph around, i.e. here, in the importer. 
+ if (node->op_def().name() == "ReadVariableOp") { + // TODO(silvasean): In some graphs, this seems to be annotated on every + // node. Why and by whom? + // TODO(b/140588338): We should ideally incorporate that information for + // all nodes, but right now, this can result in e.g. an Identity node with + // signature such as + // `(tensor) -> tensor` which fails the verifier + // (which checks for exact type equality; _output_shapes results in + // us shoehorning in the more-precise type on the output). + if (const AttrValue* attr = node->attrs().Find("_output_shapes")) { + auto& list = attr->list(); + for (auto shape : llvm::enumerate(list.shape())) { + auto* node_context = shape_refiner_->GetContext(node); + shape_inference::ShapeHandle handle; + TF_RETURN_WITH_CONTEXT_IF_ERROR( + node_context->MakeShapeFromShapeProto(shape.value(), &handle), + GetLocationStr(*node)); + node_context->set_output(shape.index(), handle); + } + } + } + // If it is the argument node, the shape handle is set explicitly, so it // can be propagated to the body nodes of the function. if (StringPiece(node->type_string()) == FunctionLibraryDefinition::kArgOp) { @@ -845,10 +876,28 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { attributes.push_back(builder_.getNamedAttr(grad_string, gradient_attr)); } - // Converts the graph to a MLIR function and adds it to the module. Uses the - // default node spec without any inputs or outputs as the function graph has - // special '_Arg' and '_Retval' ops for argument and return values. + // Converts the graph to a MLIR function and adds it to the module. + // We populate the NodeSpec so that all the _Arg ops get their shape + // added correctly. NodeSpecs specs; + for (const auto& name_and_value : func_def->attr()) { + if (name_and_value.first == "_input_shapes") { + auto& list = name_and_value.second.list(); + auto& signature = func_def->signature(); + for (int i = 0; i < list.shape_size(); i++) { + auto& input_arg = signature.input_arg(i); + auto& array_info = specs.inputs[input_arg.name()]; + array_info.imported_dtype = input_arg.type(); + array_info.shape = list.shape(i); + // TODO(b/140464702): These fields should not be exposed here. + // Seems like a layering violation. Initialize them anyway. + array_info.final_dtype = input_arg.type(); + array_info.min_value = 0.0; + array_info.max_value = 0.0; + } + } + } + ImporterBase child_importer(graph_flib_, debug_info_, specs, module_, tf_name_to_mlir_name_); TF_RETURN_IF_ERROR(child_importer.PrepareConvert(*fbody->graph)); @@ -1090,9 +1139,10 @@ mlir::Location ImporterBase::GetLocation(const NodeDef& node_def) { for (int i = 0, e = original_nodes.size(); i != e; ++i) { auto node_name = original_nodes[i]; auto func_name = (i < original_funcs.size()) ? original_funcs[i] : ""; - // Use the catenation of function and node names as the lookup key. This - // is to match the utility of generating the GraphDebugInfo. - node_call_sites.push_back(node_name_to_call_site(func_name + node_name)); + // Use the catenation of function and node names as the lookup key. + // This matches the way that the key is formed on the python side. 
+ std::string key = node_name + "@" + func_name; + node_call_sites.push_back(node_name_to_call_site(key)); } return mlir::FusedLoc::get(node_call_sites, context_); } @@ -1399,16 +1449,22 @@ StatusOr ImporterBase::InferLibFunctionType( const FunctionBody& fbody) { mlir::Builder builder(context_); + // The FunctionBody contains a graph with a single-output _Arg node for each + // function argument and a single-input _Retval node for each function return + // value. + // + // We already populated the ShapeRefiner with all the information about the + // shapes of these graph edges, so we just query it to build the corresponding + // MLIR function type signature. + llvm::SmallVector arg_types; arg_types.reserve(fbody.arg_types.size()); - for (auto dataType : fbody.arg_types) { - mlir::Type element_type; - TF_RETURN_IF_ERROR( - ::tensorflow::ConvertDataType(dataType, builder, &element_type)); - // TODO(hinsu): Derive shape of function arguments based on shapes available - // at call sites of this function. That way it is possible to have a - // partially known shape in some cases instead of unranked tensor types. - arg_types.push_back(builder.getTensorType(element_type)); + for (auto arg : fbody.arg_nodes) { + // Find node in the graph using the node id instead of using `arg` directly + // because the graph has been cloned. + auto* node = graph_->FindNodeId(arg->id()); + TF_ASSIGN_OR_RETURN(auto type, InferOutputType(*node, /*idx=*/0, builder)); + arg_types.push_back(type); } llvm::SmallVector ret_types; @@ -1417,9 +1473,6 @@ StatusOr ImporterBase::InferLibFunctionType( // Find node in the graph using the node id instead of using `ret` directly // because the graph has been cloned. auto* node = graph_->FindNodeId(ret->id()); - - // Return type of the function is type of the only input of the respective - // return node in the function. TF_ASSIGN_OR_RETURN(auto type, InferInputType(*node, /*idx=*/0, builder)); ret_types.push_back(type); } @@ -1721,4 +1774,13 @@ StatusOr ConvertSavedModelToMlir( add_default_attributes, context); } +std::string MlirModuleToString(mlir::ModuleOp module) { + std::string txt_module; + { + llvm::raw_string_ostream os{txt_module}; + module.print(os); + } + return txt_module; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h index 98bb607fa6a..6ca4c0098d7 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ +#include + #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir #include "tensorflow/cc/saved_model/loader.h" @@ -48,6 +50,9 @@ stream_executor::port::StatusOr ConvertSavedModelToMlir( const SavedModelBundle& saved_model, const GraphDebugInfo& debug_info, mlir::MLIRContext* context, bool add_default_attributes = true); +// Serialize a MLIR module to a string. 
+std::string MlirModuleToString(mlir::ModuleOp m); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_IMPORT_MODEL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc new file mode 100644 index 00000000000..648aeef36da --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -0,0 +1,212 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" + +#include "absl/types/span.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/Parser.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/type_to_shape.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" + +namespace tensorflow { +namespace { + +// Parses the MLIR module from the mlir_module_string. +Status ParseMlirModule(llvm::StringRef mlir_module_string, + mlir::MLIRContext* mlir_context, + mlir::OwningModuleRef* mlir_module) { + TF_RET_CHECK(!mlir_module_string.empty()) + << "unexpected empty serialized MLIR module string"; + TF_RET_CHECK(mlir_module) << "unexpected null MLIR module pointer"; + + // Parse the module. + *mlir_module = mlir::parseSourceString(mlir_module_string, mlir_context); + if (!*mlir_module) { + return errors::InvalidArgument("could not parse MLIR module"); + } + + return Status::OK(); +} + +// Converts arg_shapes to xla::Shape's and store into xla_input_shapes. 
+Status GetXlaInputShapes( + mlir::ModuleOp module, absl::Span arg_shapes, + const xla::CustomShapeRepresentationFn shape_representation_fn, + std::vector* xla_input_shapes) { + xla_input_shapes->clear(); + + mlir::FuncOp main_func = module.lookupSymbol("main"); + mlir::FunctionType func_type = main_func.getType(); + + int num_args = func_type.getNumInputs(); + xla_input_shapes->reserve(num_args); + + std::vector individual_arg_shapes; + individual_arg_shapes.reserve(num_args); + for (int i = 0; i < num_args; ++i) { + individual_arg_shapes.emplace_back(); + xla::Shape& xla_shape = individual_arg_shapes.back(); + + DataType dtype; + TF_RETURN_IF_ERROR(ConvertToDataType(func_type.getInput(i), &dtype)); + TF_ASSIGN_OR_RETURN(xla_shape, + shape_representation_fn(arg_shapes[i], dtype)); + } + xla_input_shapes->push_back( + xla::ShapeUtil::MakeTupleShape(individual_arg_shapes)); + return Status::OK(); +} + +// Calculates computation output shape and build OutputDescription for each +// output based on static shapes in MLIR module +Status GetOutputInfo( + mlir::ModuleOp module, + const xla::CustomShapeRepresentationFn shape_representation_fn, + xla::Shape* xla_output_shape, + std::vector* outputs) { + mlir::FuncOp main_func = module.lookupSymbol("main"); + mlir::FunctionType func_type = main_func.getType(); + + outputs->clear(); + outputs->reserve(func_type.getNumResults()); + + std::vector shapes; + shapes.reserve(func_type.getNumResults()); + + for (mlir::Type type : func_type.getResults()) { + TF_ASSIGN_OR_RETURN(xla::Shape shape, + TypeToShape(type, shape_representation_fn)); + auto tensor_type = type.dyn_cast(); + shapes.push_back(shape); + + // Construct OutputDescription for result. + outputs->emplace_back(); + XlaCompiler::OutputDescription& out_desc = outputs->back(); + TF_RETURN_IF_ERROR(ConvertToDataType(tensor_type, &out_desc.type)); + // TODO(ycao): Support constant output. + out_desc.is_constant = false; + TF_RETURN_IF_ERROR(XLAShapeToTensorShape(shape, &out_desc.shape)); + // Input_index is only meaningful for resource output. Since MLIR-based + // TF-Compiler bridge doesn't support resource output yet. Setting it to + // meaningless value -1. + // TODO(ycao): Support resource-type output. + out_desc.input_index = -1; + // MLIR-based TF-Compiler bridge doesn't support tensorlist output yet. + // TODO(ycao): Support tensorlist-type output. + out_desc.is_tensor_list = false; + } + + // XLA computation always uses Tuple shape. + *xla_output_shape = xla::ShapeUtil::MakeTupleShape(shapes); + return Status::OK(); +} + +// Gets information about how computation updates Tensorflow resources. +// TODO(ycao): Implement logic to compute resource updates when we need to +// support graphs with resource updates in MLIR-based TF compiler bridge. +void GetResourceUpdatesForMlir( + std::vector* resource_updates) { + resource_updates->clear(); +} + +// Creates a vector that maps from the parameters of the XLA computation to +// their original argument positions. +// MLIR-based TF-Compiler bridge doesn't have constant analysis yet, thus no +// inputs are known constants. Therefore, the input mapping between input to +// computation arguments is a trivial in-order 1-1 mapping. +// TODO(ycao): Support computation with compile-time constant, which requires +// non-trivial input mapping as implemented now. 
+void GetInputMappingForMlir(int num_inputs, std::vector* input_mapping) { + input_mapping->resize(num_inputs, 0); + std::iota(input_mapping->begin(), input_mapping->end(), 0); +} + +// Lowers MLIR module to XLA HLO inside an XlaComputation. +Status ConvertMLIRToXlaComputation(mlir::ModuleOp module_op, + xla::XlaComputation* xla_computation) { + { + // Make sure we catch any error reported by MLIR and forward it to the TF + // error reporting system. Report a generic error if pass manager failed + // without emitting a diagnostic. + mlir::StatusScopedDiagnosticHandler error_handler(module_op.getContext()); + mlir::xla_hlo::legalizeTF(module_op); + if (!error_handler.ok()) { + return error_handler.Combine( + errors::Internal("MLIR TF to XLA legalization failed")); + } + } + + xla::HloProto hlo_proto; + TF_RETURN_IF_ERROR(mlir::ConvertMlirHloToHlo(module_op, &hlo_proto, + /*use_tuple_args=*/true, + /*always_return_tuple=*/true)); + *xla_computation = xla::XlaComputation(hlo_proto.hlo_module()); + return Status::OK(); +} + +} // namespace + +Status CompileSerializedMlirToXlaHlo( + llvm::StringRef mlir_module_string, absl::Span arg_shapes, + const XlaCompiler::ShapeRepresentationFn shape_representation_fn, + XlaCompiler::CompilationResult* compilation_result) { + mlir::MLIRContext mlir_context; + mlir::OwningModuleRef mlir_module; + + TF_RETURN_IF_ERROR( + ParseMlirModule(mlir_module_string, &mlir_context, &mlir_module)); + auto module_op = mlir_module.get(); + + // Convert MLIR module to XLA HLO proto contained in XlaComputation. + compilation_result->computation = std::make_shared(); + TF_RETURN_IF_ERROR(ConvertMLIRToXlaComputation( + module_op, compilation_result->computation.get())); + + // Construct mapping from XlaComputation's arg to input edges of execute + // node. + GetInputMappingForMlir(arg_shapes.size(), &compilation_result->input_mapping); + + auto shape_representation_fn_no_fast_memory = + [shape_representation_fn](const TensorShape& shape, DataType dtype) { + return shape_representation_fn(shape, dtype, /*use_fast_memory=*/false); + }; + + // Compute all input shapes. + TF_RETURN_IF_ERROR(GetXlaInputShapes(module_op, arg_shapes, + shape_representation_fn_no_fast_memory, + &compilation_result->xla_input_shapes)); + + // Compute all output descriptions. + TF_RETURN_IF_ERROR(GetOutputInfo( + module_op, shape_representation_fn_no_fast_memory, + &compilation_result->xla_output_shape, &compilation_result->outputs)); + + // Compute what resource variables need to be updated after XlaComputation's + // execution. + GetResourceUpdatesForMlir(&compilation_result->resource_updates); + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h new file mode 100644 index 00000000000..e7bfd264675 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h @@ -0,0 +1,35 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_
+#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_
+
+#include "absl/types/span.h"
+#include "llvm/ADT/StringRef.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace tensorflow {
+
+// Compiles a serialized MLIR module into XLA HLO, generates all accompanying
+// metadata and stores them in CompilationResult.
+Status CompileSerializedMlirToXlaHlo(
+    llvm::StringRef mlir_module_string, absl::Span<TensorShape> arg_shapes,
+    const XlaCompiler::ShapeRepresentationFn shape_representation_fn,
+    XlaCompiler::CompilationResult* compilation_result);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc
new file mode 100644
index 00000000000..eee531a2550
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace tensorflow {
+namespace {
+
+// A dummy shape representation function that simply converts given shape into
+// an xla::Shape without assigning any layouts.
+xla::StatusOr TestShapeRepresentation(const TensorShape& shape, + DataType type, + bool use_fast_memory) { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); + return xla_shape; +} + +TEST(CompileSerializedMlirToXlaHloTest, InvalidSerliazedMlirModule) { + string invalid_mlir_module = "totally @invalid MLIR module {here} <-"; + std::vector arg_shapes; + XlaCompiler::CompilationResult compilation_result; + + Status s = CompileSerializedMlirToXlaHlo( + invalid_mlir_module, absl::Span(arg_shapes), + TestShapeRepresentation, &compilation_result); + EXPECT_EQ(s.code(), tensorflow::errors::Code::INVALID_ARGUMENT); +} + +TEST(CompileSerializedMlirToXlaHloTest, Success) { + string mlir_module = R"( + module { + func @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf.AddV2"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor, tensor) -> tensor + return %0 : tensor + } + } + )"; + + std::vector arg_shapes(2, TensorShape()); + XlaCompiler::CompilationResult compilation_result; + + Status s = CompileSerializedMlirToXlaHlo( + mlir_module, absl::Span(arg_shapes), TestShapeRepresentation, + &compilation_result); + ASSERT_TRUE(s.ok()); + + const xla::HloModuleConfig module_config( + compilation_result.computation->GetProgramShape().ValueOrDie()); + auto status_or_hlo_module = xla::HloModule::CreateFromProto( + compilation_result.computation->proto(), module_config); + ASSERT_TRUE(status_or_hlo_module.ok()); + string expected_hlo_module_string = R"(HloModule main.6 + +ENTRY %main.6 (arg_tuple.1: (f32[], f32[])) -> (f32[]) { + %arg_tuple.1 = (f32[], f32[]) parameter(0) + %get-tuple-element.2 = f32[] get-tuple-element((f32[], f32[]) %arg_tuple.1), index=0 + %get-tuple-element.3 = f32[] get-tuple-element((f32[], f32[]) %arg_tuple.1), index=1 + %add.4 = f32[] add(f32[] %get-tuple-element.2, f32[] %get-tuple-element.3) + ROOT %tuple.5 = (f32[]) tuple(f32[] %add.4) +} + +)"; + EXPECT_EQ(status_or_hlo_module.ValueOrDie()->ToString(), + expected_hlo_module_string); + + // Expect an iota like input mapping. + EXPECT_EQ(compilation_result.input_mapping, std::vector({0, 1})); + + // Expect a single tuple-shape, containing two F32 scalars. + EXPECT_EQ(compilation_result.xla_input_shapes.size(), 1); + xla::Shape expected_input_shape = + xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}), + xla::ShapeUtil::MakeShape(xla::F32, {})}); + EXPECT_EQ(compilation_result.xla_input_shapes.front(), expected_input_shape); + + // Expect output shape is a tuple shape containing a single F32 Scalar type. + const xla::Shape output_shape = + xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {}); + const xla::Shape tuple_output_shape = + xla::ShapeUtil::MakeTupleShape({output_shape}); + EXPECT_EQ(compilation_result.xla_output_shape, tuple_output_shape); + + // Expect exactly 1 OutputDescrpition. + EXPECT_EQ(compilation_result.outputs.size(), 1); + const XlaCompiler::OutputDescription& output_desc = + compilation_result.outputs.front(); + EXPECT_EQ(output_desc.type, DataType::DT_FLOAT); + EXPECT_EQ(output_desc.shape, TensorShape()); + EXPECT_FALSE(output_desc.is_constant); + EXPECT_FALSE(output_desc.is_tensor_list); + + // Expect no resource updates from computation. 
+ EXPECT_TRUE(compilation_result.resource_updates.empty()); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index dbb3cf08717..804b1372ffc 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -31,7 +31,9 @@ limitations under the License. #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/OperationSupport.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/Support/DebugStringHelper.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" @@ -184,6 +186,27 @@ void UpdateCompositeWhileOp(NodeDef* node_def) { } } +// Returns true if the control dialect op should map to Ref node in TensorFlow +// Graph. For NextIteration it uses the 1st operand type. For all others +// (Enter/Exit/Merge/Switch), if the output type is ref, +// they correspond to the Ref equivalent op in TF Graph. +static bool IsRefTypeControlOp(mlir::Operation* op) { + auto op_name_or_status = GetTensorFlowOpName(op->getName().getStringRef()); + if (!op_name_or_status.ok()) return false; + + auto op_name = op_name_or_status.ConsumeValueOrDie(); + if (op_name.equals("NextIteration")) + return mlir::getElementTypeOrSelf(op->getOperand(0)->getType()) + .isa(); + + if (op_name.equals("Enter") || op_name.equals("Exit") || + op_name.equals("Switch") || op_name.equals("Merge")) { + return getElementTypeOrSelf(op->getResult(0)->getType()) + .isa(); + } + return false; +} + } // anonymous namespace StatusOr GetTensorFlowOpName(llvm::StringRef op_name) { @@ -208,9 +231,21 @@ StatusOr> GetOperationNodeDef( auto node_def = absl::make_unique(); // Note: we do not use NodeBuilder or NodeDefBuilder as that would require // mapping back from the inputs to the input arguments. - TF_ASSIGN_OR_RETURN(auto op_name, + + // Some control flow ops in TensorFlow Graph have their respective "Ref" ops + // as well. For example there is Enter and RefEnter op. RefEnter forwards + // the input ref buffer to output. However both Enter and RefEnter are + // mapped to tf_executor::EnterOp during import and then to _tf.Enter op in + // control dialect. Check if it is a Ref op to correctly map to the TensorFlow + // Graph op. + llvm::SmallString<64> op_name; + if (IsRefTypeControlOp(inst)) op_name = "Ref"; + + TF_ASSIGN_OR_RETURN(auto tf_name, GetTensorFlowOpName(inst->getName().getStringRef())); - node_def->set_op(op_name); + op_name.append(tf_name); + + node_def->set_op(op_name.str()); node_def->set_name(name); // Add inputs to the NodeDef based on the number of operands. 
This is required
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc
index 52fb7cac5b7..5be0ebd6894 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc
@@ -37,6 +37,30 @@ inline llvm::StringRef StringViewToRef(absl::string_view view) {
 
 namespace tensorflow {
 
+Status LoadProtoFromBuffer(absl::string_view input,
+                           tensorflow::protobuf::Message* proto) {
+  tensorflow::protobuf::TextFormat::Parser parser;
+  // Don't produce errors when attempting to parse text format as it would fail
+  // when the input is actually a binary file.
+  NoOpErrorCollector collector;
+  parser.RecordErrorsTo(&collector);
+  // Attempt to parse as text.
+  tensorflow::protobuf::io::ArrayInputStream input_stream(input.data(),
+                                                          input.size());
+  if (parser.Parse(&input_stream, proto)) {
+    return Status::OK();
+  }
+  // Else attempt to parse as binary.
+  proto->Clear();
+  tensorflow::protobuf::io::ArrayInputStream binary_stream(input.data(),
+                                                           input.size());
+  if (proto->ParseFromZeroCopyStream(&binary_stream)) {
+    return Status::OK();
+  }
+  LOG(ERROR) << "Error parsing Protobuf";
+  return errors::InvalidArgument("Could not parse input proto");
+}
+
 Status LoadProtoFromFile(absl::string_view input_filename,
                          tensorflow::protobuf::Message* proto) {
   auto file_or_err =
@@ -45,26 +69,10 @@ Status LoadProtoFromFile(absl::string_view input_filename,
     return errors::InvalidArgument("Could not open input file");
 
   auto& input_file = *file_or_err;
-  std::string content(input_file->getBufferStart(),
-                      input_file->getBufferSize());
+  absl::string_view content(input_file->getBufferStart(),
+                            input_file->getBufferSize());
 
-  tensorflow::protobuf::TextFormat::Parser parser;
-  // Don't produce errors when attempting to parse text format as it would fail
-  // when the input is actually a binary file.
-  NoOpErrorCollector collector;
-  parser.RecordErrorsTo(&collector);
-  // Attempt to parse as text.
-  if (parser.ParseFromString(content, proto)) {
-    return Status::OK();
-  }
-  // Else attempt to parse as binary.
-  proto->Clear();
-  std::istringstream istream(content);
-  if (proto->ParseFromIstream(&istream)) {
-    return Status::OK();
-  }
-  LOG(ERROR) << "Error parsing Protobuf: " << input_filename;
-  return errors::InvalidArgument("Could not parse input file");
+  return LoadProtoFromBuffer(content, proto);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h
index 1158b9a6173..a7d00cf890e 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h
+++ b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.h
@@ -22,6 +22,11 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Reads text (.pbtext) or binary (.pb) format of a proto message from the
+// given buffer. Returns an error status if the input cannot be parsed as a proto.
+Status LoadProtoFromBuffer(absl::string_view input,
+                           tensorflow::protobuf::Message* proto);
+
 // Reads text (.pbtext) or binary (.pb) format of a proto message from the given
 // file path. Returns error status of the file is not found or malformed proto.
Status LoadProtoFromFile(absl::string_view input_filename, diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index 4a6f9c837aa..f70868e217f 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -55,8 +55,6 @@ static llvm::cl::opt verify_passes( llvm::cl::desc("Run the verifier after each transformation pass"), llvm::cl::init(true)); -static std::vector *pass_list; - int main(int argc, char **argv) { tensorflow::InitMlir y(&argc, &argv); @@ -64,9 +62,8 @@ int main(int argc, char **argv) { mlir::registerPassManagerCLOptions(); // Parse pass names in main to ensure static initialization completed. - llvm::cl::list - pass_list("", llvm::cl::desc("Compiler passes to run")); - ::pass_list = &pass_list; + mlir::PassPipelineCLParser pass_pipeline("", "Compiler passes to run"); + llvm::cl::ParseCommandLineOptions(argc, argv, "TF MLIR modular optimizer driver\n"); @@ -78,7 +75,7 @@ int main(int argc, char **argv) { auto output = mlir::openOutputFile(output_filename, &error_message); QCHECK(output) << error_message; - if (failed(mlir::MlirOptMain(output->os(), std::move(file), pass_list, + if (failed(mlir::MlirOptMain(output->os(), std::move(file), pass_pipeline, split_input_file, verify_diagnostics, verify_passes))) return 1; diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 39cd431165e..bcede30ea73 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -12,6 +12,7 @@ package_group( packages = [ "//babelfish/device/...", "//learning/brain/experimental/mlir/...", + "//learning/brain/google/xla/kernels/...", "//tensorflow/compiler/mlir/...", "//tensorflow/compiler/xla/...", "//third_party/mlir_edge/...", @@ -33,6 +34,8 @@ gentbl( tbl_outs = [ ("-gen-op-decls", "ir/hlo_ops.h.inc"), ("-gen-op-defs", "ir/hlo_ops.cc.inc"), + ("-gen-struct-attr-decls", "ir/hlo_structs.h.inc"), + ("-gen-struct-attr-defs", "ir/hlo_structs.cc.inc"), ], tblgen = "@local_config_mlir//:mlir-tblgen", td_file = "ir/hlo_ops.td", @@ -84,6 +87,7 @@ cc_library( deps = [ ":hlo", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", "@llvm//:support", "@local_config_mlir//:Analysis", "@local_config_mlir//:IR", @@ -224,9 +228,15 @@ cc_library( srcs = ["type_to_shape.cc"], hdrs = ["type_to_shape.h"], deps = [ + "//tensorflow/compiler/mlir/tensorflow:convert_tensor", + "//tensorflow/compiler/mlir/tensorflow:convert_type", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:lib", + "//tensorflow/core:framework", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:types", + "@llvm//:support", "@local_config_mlir//:IR", "@local_config_mlir//:Support", ], @@ -241,6 +251,7 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_proto_cc", "//tensorflow/core:test_main", "@local_config_mlir//:IR", ], @@ -258,6 +269,7 @@ cc_library( ":type_to_shape", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/xla:comparison_util", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", @@ -338,6 +350,7 @@ tf_native_cc_binary( deps = [ "@llvm//:support", "@llvm//:tablegen", + 
"@local_config_mlir//:Support", "@local_config_mlir//:TableGen", ], ) diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index 0f71859d9a1..d8b096cd85a 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -89,6 +89,17 @@ StatusOr CreateDenseAttrFromLiteral(ShapedType type, } #undef DENSE_ELEMENT_ATTR_BUILDER } + +// Returns whether the instruction is a default dot operation. +bool DotIsDefault(const HloInstruction* instruction) { + auto dot_dimensions = instruction->dot_dimension_numbers(); + DotDimensionNumbers default_dimension_numbers; + default_dimension_numbers.add_lhs_contracting_dimensions( + instruction->operand(0)->shape().dimensions_size() == 1 ? 0 : 1); + default_dimension_numbers.add_rhs_contracting_dimensions(0); + return xla::protobuf_util::ProtobufEquals(dot_dimensions, + default_dimension_numbers); +} } // namespace StatusOr HloFunctionImporter::ImportFunction( @@ -230,13 +241,18 @@ StatusOr HloFunctionImporter::ImportInstruction( MakeAndReturn(BroadcastInDimOp); } case HloOpcode::kDot: { - // TODO(b/129153247) Add support for batch and contracting dimensions. - TF_RETURN_IF_ERROR(ValidateDotDimensions(instruction)); - // TODO(b/129709049) The HLO text format elides this in the all DEFAULT // case and the parser sticks it in. Maybe we should too. attributes.push_back(ConvertPrecisionConfig(instruction)); - MakeAndReturn(DotOp); + + // Consider consolidating DotOps together. + if (DotIsDefault(instruction)) { + MakeAndReturn(DotOp); + } + + attributes.push_back(builder_->getNamedAttr( + "dot_dimension_numbers", ConvertDotDimensionNumbers(instruction))); + MakeAndReturn(DotGeneralOp); } case HloOpcode::kCall: { TF_ASSIGN_OR_RETURN(FuncOp function, @@ -311,7 +327,8 @@ StatusOr HloFunctionImporter::ImportInstruction( ->create( loc, result_type, operands[0], ConvertDimensions(instruction->slice_starts()), - ConvertDimensions(instruction->slice_limits())) + ConvertDimensions(instruction->slice_limits()), + ConvertDimensions(instruction->slice_strides())) .getOperation(); } case HloOpcode::kConcatenate: { @@ -324,8 +341,12 @@ StatusOr HloFunctionImporter::ImportInstruction( .getOperation(); } case HloOpcode::kReduce: { + // Operands in the first half are reduction inputs and the remaining + // operands are corresponding initial values. 
+ size_t num_inputs = operands.size() / 2; auto reduce = func_builder->create( - loc, result_type, operands, + loc, result_type, llvm::makeArrayRef(operands).take_front(num_inputs), + llvm::makeArrayRef(operands).drop_front(num_inputs), ConvertDimensions(instruction->dimensions())); TF_RETURN_IF_ERROR( ImportComputation(instruction->to_apply(), &reduce.body())); @@ -537,37 +558,54 @@ mlir::NamedAttribute HloFunctionImporter::ConvertComparisonDirection( ComparisonDirectionToString(instruction->comparison_direction()))); } -mlir::ElementsAttr HloFunctionImporter::ConvertDimensions( +mlir::DenseIntElementsAttr HloFunctionImporter::ConvertDimensions( llvm::ArrayRef op_dimensions) { llvm::SmallVector dimensions; dimensions.reserve(op_dimensions.size()); for (auto value : op_dimensions) dimensions.emplace_back(APInt(64, value)); return DenseIntElementsAttr::get( - builder_->getTensorType(dimensions.size(), builder_->getIntegerType(64)), - dimensions); + builder_->getTensorType(dimensions.size(), + builder_->getIntegerType(64)), + dimensions) + .cast(); } -mlir::ElementsAttr HloFunctionImporter::Convert( +mlir::DenseIntElementsAttr HloFunctionImporter::Convert( llvm::ArrayRef op_dimensions) { - return builder_->getDenseIntElementsAttr( - builder_->getTensorType(op_dimensions.size(), - builder_->getIntegerType(64)), - op_dimensions); + return builder_ + ->getDenseIntElementsAttr( + builder_->getTensorType(op_dimensions.size(), + builder_->getIntegerType(64)), + op_dimensions) + .cast(); } -Status HloFunctionImporter::ValidateDotDimensions(HloInstruction* instruction) { - DotDimensionNumbers expected_dimension_numbers; - expected_dimension_numbers.add_lhs_contracting_dimensions( - instruction->operand(0)->shape().dimensions_size() == 1 ? 0 : 1); - expected_dimension_numbers.add_rhs_contracting_dimensions(0); - if (!xla::protobuf_util::ProtobufEquals(instruction->dot_dimension_numbers(), - expected_dimension_numbers)) { - return tensorflow::errors::Internal( - absl::StrCat("Dot operation has unsupported dimension numbers: ", - instruction->dot_dimension_numbers().DebugString())); - } - return Status::OK(); +mlir::xla_hlo::DotDimensionNumbers +HloFunctionImporter::ConvertDotDimensionNumbers(HloInstruction* instruction) { + auto dot_dimensions = instruction->dot_dimension_numbers(); + std::vector rhs_contracting_dimensions( + dot_dimensions.rhs_contracting_dimensions().begin(), + dot_dimensions.rhs_contracting_dimensions().end()); + std::vector lhs_contracting_dimensions( + dot_dimensions.lhs_contracting_dimensions().begin(), + dot_dimensions.lhs_contracting_dimensions().end()); + std::vector rhs_batch_dimensions( + dot_dimensions.rhs_batch_dimensions().begin(), + dot_dimensions.rhs_batch_dimensions().end()); + std::vector lhs_batch_dimensions( + dot_dimensions.lhs_batch_dimensions().begin(), + dot_dimensions.lhs_batch_dimensions().end()); + + // Push the attributes into our new DictionaryAttr. 
+ auto lhs_batch_dims_attr = Convert(lhs_batch_dimensions); + auto rhs_batch_dims_attr = Convert(rhs_batch_dimensions); + auto lhs_contracting_dims_attr = Convert(lhs_contracting_dimensions); + auto rhs_contracting_dims_attr = Convert(rhs_contracting_dimensions); + + return mlir::xla_hlo::DotDimensionNumbers::get( + lhs_batch_dims_attr, rhs_batch_dims_attr, lhs_contracting_dims_attr, + rhs_contracting_dims_attr, context_); } } // namespace xla diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index b2e932bb09a..c6b61f94f5e 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -25,6 +25,7 @@ limitations under the License. #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -98,14 +99,15 @@ class HloFunctionImporter { xla::HloInstruction* instruction); // Converts the dimensions of an HLO instruction into an MLIR attribute. - mlir::ElementsAttr ConvertDimensions( + mlir::DenseIntElementsAttr ConvertDimensions( llvm::ArrayRef op_dimensions); - // Converts Array ref to an ElementsAttr. - mlir::ElementsAttr Convert(llvm::ArrayRef op_dimensions); + // Converts Array ref to an DenseIntElementsAttr. + mlir::DenseIntElementsAttr Convert(llvm::ArrayRef op_dimensions); - // Ensures dot instruction has only default contracting and batch dimensions. - Status ValidateDotDimensions(xla::HloInstruction* instruction); + // Converts the dot dimensions to attributes. + mlir::xla_hlo::DotDimensionNumbers ConvertDotDimensionNumbers( + xla::HloInstruction* instruction); mlir::MLIRContext* context_; mlir::ModuleOp module_; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index a5df379d90b..6c0f5179025 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -44,6 +44,10 @@ limitations under the License. 
#include "mlir/IR/Value.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h.inc" +namespace mlir { +#include "tensorflow/compiler/mlir/xla/ir/hlo_structs.cc.inc" +} // namespace mlir + using namespace mlir; using namespace mlir::xla_hlo; @@ -68,8 +72,10 @@ Operation* XlaHloDialect::materializeConstant(OpBuilder& builder, return nullptr; } -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.cc.inc" +template +static LogicalResult Verify(T op) { + return success(); +} //===----------------------------------------------------------------------===// // ConstOp @@ -102,6 +108,35 @@ void ConstOp::build(Builder* builder, OperationState* result, Attribute value) { result->addAttribute("value", value); } +//===----------------------------------------------------------------------===// +// IotaOp +//===----------------------------------------------------------------------===// + +OpFoldResult IotaOp::fold(ArrayRef operands) { + const auto output_type = getResult()->getType().cast(); + const auto output_size = output_type.getNumElements(); + const auto dimension = iota_dimension().getLimitedValue(); + const auto max_dim_size = output_type.getDimSize(dimension); + int bitwidth = output_type.getElementType().getIntOrFloatBitWidth(); + + llvm::SmallVector values; + values.reserve(output_size); + + int64_t increase_stride = output_size; + for (int i = 0; i <= dimension; i++) { + increase_stride /= output_type.getDimSize(i); + } + + int64_t current_value = 0; + for (int i = 0; i < output_size; i++) { + int64_t value = (current_value / increase_stride) % max_dim_size; + values.push_back(APInt(bitwidth, value)); + ++current_value; + } + + return DenseIntElementsAttr::get(output_type, values); +} + //===----------------------------------------------------------------------===// // ConvertOp //===----------------------------------------------------------------------===// @@ -175,32 +210,211 @@ OpFoldResult ConvertOp::fold(ArrayRef operands) { } //===----------------------------------------------------------------------===// -// IotaOp +// GetTupleElementOp //===----------------------------------------------------------------------===// -OpFoldResult IotaOp::fold(ArrayRef operands) { - const auto output_type = getResult()->getType().cast(); - const auto output_size = output_type.getNumElements(); - const auto dimension = iota_dimension().getLimitedValue(); - const auto max_dim_size = output_type.getDimSize(dimension); - int bitwidth = output_type.getElementType().getIntOrFloatBitWidth(); - - llvm::SmallVector values; - values.reserve(output_size); - - int64_t increase_stride = output_size; - for (int i = 0; i <= dimension; i++) { - increase_stride /= output_type.getDimSize(i); +static LogicalResult Verify(GetTupleElementOp op) { + auto indexVal = op.index().getZExtValue(); + auto operandType = op.getOperand()->getType().cast(); + if (indexVal >= operandType.size()) { + return op.emitOpError( + llvm::formatv("index {0} is out of bounds of operand with size {1}", + indexVal, operandType.size())); } - int64_t current_value = 0; - for (int i = 0; i < output_size; i++) { - int64_t value = (current_value / increase_stride) % max_dim_size; - values.push_back(APInt(bitwidth, value)); - ++current_value; + auto expectedType = operandType.getType(indexVal); + if (op.getType() != expectedType) { + return op.emitOpError(llvm::formatv("has return type {0}, but expected {1}", + op.getType(), expectedType)); + } + return success(); +} + 
+//===----------------------------------------------------------------------===// +// TupleOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(TupleOp op) { + SmallVector operandTypes = {op.operand_type_begin(), + op.operand_type_end()}; + auto expectedType = TupleType::get(operandTypes, op.getContext()); + if (op.getType() != expectedType) { + return op.emitOpError(llvm::formatv("has return type {0}, but expected {1}", + op.getType(), expectedType)); + } + return success(); +} + +//===----------------------------------------------------------------------===// +// BroadcastOp +//===----------------------------------------------------------------------===// + +// TODO(b/129012527) These should be expressed as type constraints. +static LogicalResult Verify(BroadcastOp op) { + auto sizes = op.broadcast_sizes(); + auto sizesType = sizes.getType(); + auto sizesRank = sizesType.getRank(); + if (sizesRank != 1) { + return op.emitOpError(llvm::formatv( + "broadcast_sizes has rank {0} instead of rank 1", sizesRank)); } - return DenseIntElementsAttr::get(output_type, values); + auto resultType = op.getResult()->getType().cast(); + auto resultRank = resultType.getRank(); + auto operandType = op.operand()->getType().cast(); + auto operandRank = operandType.getRank(); + auto sizesSize = sizesType.getNumElements(); + auto expectedRank = operandRank + sizesSize; + + if (resultRank != expectedRank) { + return op.emitOpError( + llvm::formatv("result rank ({0}) does not match operand rank " + "({2}) plus size of broadcast_sizes ({3})", + resultRank, operandRank, sizesSize)); + } + + llvm::SmallVector expectedShape(sizes.getValues()); + + auto operandShape = operandType.getShape(); + expectedShape.insert(expectedShape.end(), operandShape.begin(), + operandShape.end()); + + auto resultShape = resultType.getShape(); + if (resultShape != llvm::makeArrayRef(expectedShape)) { + return op.emitOpError(llvm::formatv( + "result has shape [{0}] instead of [{1}]", + llvm::make_range(resultShape.begin(), resultShape.end()), + llvm::make_range(expectedShape.begin(), expectedShape.end()))); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// BroadcastInDimOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(BroadcastInDimOp op) { + auto operandType = op.operand()->getType().cast(); + auto operandRank = operandType.getRank(); + if (!op.broadcast_dimensions()) { + if (operandRank == 0) { + return success(); + } + return op.emitOpError( + llvm::formatv("broadcast_dimensions is absent, but required because " + "operand has non-zero rank ({0})", + operandRank)); + } + + auto dimensions = *op.broadcast_dimensions(); + auto dimensionsType = op.broadcast_dimensions()->getType(); + auto dimensionsRank = dimensionsType.getRank(); + if (dimensionsRank != 1) { + return op.emitOpError(llvm::formatv( + "broadcast_dimensions has rank {0} instead of rank 1", dimensionsRank)); + } + + auto dimensionsSize = dimensionsType.getNumElements(); + if (dimensionsSize != operandRank) { + return op.emitOpError(llvm::formatv( + "broadcast_dimensions size ({0}) does not match operand rank ({1})", + dimensionsSize, operandRank)); + } + + auto resultType = op.getResult()->getType().cast(); + auto resultRank = resultType.getRank(); + if (resultRank < operandRank) { + return op.emitOpError( + llvm::formatv("result rank ({0}) is less than operand rank ({1})", + 
resultRank, operandRank)); + } + + for (int i = 0; i != dimensionsSize; ++i) { + auto dimIndex = dimensions.getValue(i); + if (dimIndex >= resultRank) { + return op.emitOpError( + llvm::formatv("broadcast_dimensions contains invalid value {0} for " + "result result with rank {1}", + dimIndex, resultRank)); + } + + auto dimSize = operandType.getDimSize(i); + auto resultDimSize = resultType.getDimSize(dimIndex); + if (dimSize != 1 && dimSize != resultDimSize) { + return op.emitOpError( + llvm::formatv("size of operand dimension {0} ({1}) is not equal to " + "1 or size of result dimension {2} ({3})", + i, dimSize, dimIndex, resultDimSize)); + } + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// ClampOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(ClampOp op) { + auto operandType = op.operand()->getType().cast(); + auto operandShape = operandType.getShape(); + auto minType = op.min()->getType().cast(); + + auto minShape = minType.getShape(); + if (minShape != operandShape && minType.getRank() != 0) { + return op.emitOpError(llvm::formatv( + "min shape [{0}] is not scalar and does not match operand shape [{1}]", + llvm::make_range(minShape.begin(), minShape.end()), + llvm::make_range(operandShape.begin(), operandShape.end()))); + } + + auto maxType = op.max()->getType().cast(); + auto maxShape = maxType.getShape(); + if (maxShape != operandShape && maxType.getRank() != 0) { + return op.emitOpError(llvm::formatv( + "max shape [{0}] is not scalar and does not match operand shape [{1}]", + llvm::make_range(maxShape.begin(), maxShape.end()), + llvm::make_range(operandShape.begin(), operandShape.end()))); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// ConcatenateOp +//===----------------------------------------------------------------------===// + +OpFoldResult ConcatenateOp::fold(ArrayRef operands) { + if (getNumOperands() == 1) return getOperand(0); + return {}; +} + +static LogicalResult Verify(ConcatenateOp op) { + auto firstType = op.getOperand(0)->getType().cast(); + + auto firstShape = firstType.getShape(); + int numOperands = op.getNumOperands(); + for (int i = 1; i < numOperands; i++) { + auto secondType = op.getOperand(i)->getType().cast(); + + if (firstType.getRank() != secondType.getRank()) { + return op.emitOpError( + llvm::formatv("operands (0) and ({0}) do not match rank.", i)); + } + + auto secondShape = secondType.getShape(); + for (int d = 0; d < firstType.getRank(); ++d) { + if (firstShape[d] != secondShape[d] && d != op.dimension()) { + return op.emitOpError(llvm::formatv( + "operands (0) and ({0}) non-concat dimensions do not match " + "({1}) != ({2}).", + i, llvm::make_range(firstShape.begin(), firstShape.end()), + llvm::make_range(secondShape.begin(), secondShape.end()))); + } + } + } + return success(); } //===----------------------------------------------------------------------===// @@ -225,6 +439,89 @@ OpFoldResult ReshapeOp::fold(ArrayRef operands) { return {}; } +//===----------------------------------------------------------------------===// +// SelectOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(SelectOp op) { + auto onTrueType = op.on_true()->getType().cast(); + auto onFalseType = op.on_false()->getType().cast(); + + if (onTrueType != onFalseType) { + return op.emitOpError( + llvm::formatv("on_true 
type ({0}) does not match on_false type ({1})", + onTrueType, onFalseType)); + } + + auto predType = op.pred()->getType().cast(); + auto predShape = predType.getShape(); + auto predRank = predType.getRank(); + auto selectShape = onTrueType.getShape(); + + if (predRank != 0 && predShape != selectShape) { + return op.emitOpError(llvm::formatv( + "pred shape ([{0}]) is not scalar and does not match operand shapes " + "([{1}])", + llvm::make_range(predShape.begin(), predShape.end()), + llvm::make_range(selectShape.begin(), selectShape.end()))); + } + + return success(); +} + +//===----------------------------------------------------------------------===// +// PadOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(PadOp op) { + auto input_type = op.operand()->getType().cast(); + auto pad_type = op.padding_value()->getType().cast(); + + if (pad_type.getRank() != 0) { + return op.emitOpError( + llvm::formatv("padding value type should be a rank-0 " + "tensor, is rank {0}", + pad_type.getRank())); + } + + const auto& padding_low = op.edge_padding_low(); + if (padding_low.getType().getNumElements() != input_type.getRank()) { + return op.emitOpError(llvm::formatv( + "edge_padding_low length ({0}) must match operand rank ({1}).", + padding_low.getType().getNumElements(), input_type.getRank())); + } + + const auto& padding_high = op.edge_padding_high(); + if (padding_high.getType().getNumElements() != input_type.getRank()) { + return op.emitOpError(llvm::formatv( + "edge_padding_high length ({0}) must match operand rank ({1}).", + padding_high.getType().getNumElements(), input_type.getRank())); + } + + auto input_shape = input_type.getShape(); + auto output_shape = + op.getResult()->getType().cast().getShape(); + if (input_shape.size() != output_shape.size()) { + return op.emitOpError( + llvm::formatv("Operand rank ({0}) and result rank({0}) should match", + input_shape.size(), output_shape.size())); + } + + for (int i = 0, e = input_shape.size(); i < e; i++) { + int expected_output = input_shape[i] + + padding_low.getValue(i).getInt() + + padding_high.getValue(i).getInt(); + if (expected_output != output_shape[i]) { + return op.emitOpError( + llvm::formatv("Expected output shape ({0}) and " + "output shape ({1}) should match.", + expected_output, output_shape[i])); + } + } + + return success(); +} + //===----------------------------------------------------------------------===// // TransposeOp //===----------------------------------------------------------------------===// @@ -237,3 +534,51 @@ OpFoldResult TransposeOp::fold(ArrayRef operands) { } return getOperand(); } + +static LogicalResult Verify(TransposeOp op) { + auto permutationType = op.permutation().getType(); + auto permutationRank = permutationType.getRank(); + if (permutationRank != 1) { + return op.emitOpError(llvm::formatv( + "permutation has rank {0} instead of rank 1", permutationRank)); + } + + auto operandType = op.operand()->getType().cast(); + auto operandRank = operandType.getRank(); + auto permutationSize = permutationType.getNumElements(); + if (permutationSize != operandRank) { + return op.emitOpError(llvm::formatv( + "permutation size ({0}) does not match operand rank ({1})", + permutationSize, operandRank)); + } + + auto resultType = op.getResult()->getType().cast(); + auto resultRank = resultType.getRank(); + if (resultRank != operandRank) { + return op.emitOpError( + llvm::formatv("result rank ({0}) does not match operand rank ({1})", + resultRank, 
operandRank)); + } + + auto resultShape = resultType.getShape(); + + auto expectedShape = SmallVector(operandRank); + for (int i = 0; i != operandRank; ++i) { + auto permutedDim = op.permutation().getValue(i).getInt(); + expectedShape[i] = operandType.getDimSize(permutedDim); + } + + if (resultShape != llvm::makeArrayRef(expectedShape)) { + return op.emitOpError(llvm::formatv( + "result shape is [{0}" + "] instead of [{1}" + "]", + llvm::make_range(resultShape.begin(), resultShape.end()), + llvm::make_range(expectedShape.begin(), expectedShape.end()))); + } + + return success(); +} + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h index 3260a829734..09a9cec968f 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h @@ -32,6 +32,8 @@ limitations under the License. namespace mlir { class OpBuilder; +#include "tensorflow/compiler/mlir/xla/ir/hlo_structs.h.inc" + namespace xla_hlo { class XlaHloDialect : public Dialect { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 5827225988b..e6efdc82d9d 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -38,6 +38,10 @@ class HLO_Op traits> : Op { // Whether this operation has a custom conversion to HLO or not. bit hasCustomHLOConverter = 0b0; + + // TODO(b/129012527) Much of this custom verification should be expressed as + // type constraints. + let verifier = [{ return Verify(*this); }]; } //===----------------------------------------------------------------------===// @@ -140,9 +144,9 @@ def HLO_TanhOp: HLO_UnaryElementwiseOp<"tanh", class HLO_BinaryElementwiseOp traits> : HLO_Op { let arguments = (ins - HLO_Tensor:$lhs, - HLO_Tensor:$rhs, - BroadcastDimAttr:$broadcast_dimensions + HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + BroadcastDimAttr:$broadcast_dimensions ); let results = (outs HLO_Tensor); let parser = [{ return mlir::impl::parseBinaryOp(parser, result); }]; @@ -196,11 +200,13 @@ def HLO_WhileOp: HLO_Op<"while", [NoSideEffect, SameOperandsAndResultType]> { def HLO_ReduceOp: HLO_Op<"reduce", [ NoSideEffect, + SameVariadicOperandSize, SingleBlockImplicitTerminator<"ReturnOp"> ]>, BASE_HLO_ReduceOp { let arguments = (ins - Variadic:$operands_and_init, - ElementsAttr:$dimensions + Variadic:$operands, + Variadic:$init_values, + I64ElementsAttr:$dimensions ); let results = (outs Variadic); @@ -241,10 +247,10 @@ def HLO_TupleOp : HLO_Op<"tuple", [NoSideEffect]>, BASE_HLO_TupleOp { def HLO_CompareOp: HLO_Op<"compare", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_CompareOp { let arguments = (ins - HLO_Tensor:$lhs, - HLO_Tensor:$rhs, - BroadcastDimAttr:$broadcast_dimensions, - HLO_ComparisonDirectionAttr:$comparison_direction + HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + BroadcastDimAttr:$broadcast_dimensions, + HLO_ComparisonDirectionAttr:$comparison_direction ); let results = (outs HLO_PredTensor); } @@ -256,11 +262,12 @@ def HLO_CompareOp: HLO_Op<"compare", def HLO_SliceOp: HLO_Op< "slice", [NoSideEffect, SameOperandsAndResultElementType, - AllTypesMatch<["start_indices", "limit_indices"]>]> { - let arguments = ( - ins HLO_Tensor:$operand, - ElementsAttr:$start_indices, - ElementsAttr:$limit_indices + AllTypesMatch<["start_indices", "limit_indices", "strides"]>]> { + let arguments = (ins + HLO_Tensor:$operand, + I64ElementsAttr:$start_indices, + 
I64ElementsAttr:$limit_indices, + I64ElementsAttr:$strides ); let results = (outs HLO_Tensor); @@ -309,58 +316,10 @@ def HLO_BroadcastOp : HLO_Op<"broadcast", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_BroadcastOp { let arguments = (ins HLO_Tensor:$operand, - ElementsAttr:$broadcast_sizes + I64ElementsAttr:$broadcast_sizes ); let results = (outs HLO_Tensor); - - // TODO(b/129012527) These should be expressed as type constraints. - let verifier = [{ - auto sizes = broadcast_sizes().dyn_cast(); - if (!sizes) { - return emitOpError(llvm::formatv( - "broadcast_sizes must be a DenseIntElementsAttr; got {0}", - broadcast_sizes())); - } - auto sizesType = sizes.getType().cast(); - auto sizesRank = sizesType.getRank(); - if (sizesRank != 1) { - return emitOpError(llvm::formatv( - "broadcast_sizes has rank {0} instead of rank 1", sizesRank)); - } - - auto resultType = getResult()->getType().cast(); - auto resultRank = resultType.getRank(); - auto operandType = operand()->getType().cast(); - auto operandRank = operandType.getRank(); - auto sizesSize = sizesType.getNumElements(); - auto expectedRank = operandRank + sizesSize; - - if (resultRank != expectedRank) { - return emitOpError( - llvm::formatv("result rank ({0}) does not match operand rank " - "({2}) plus size of broadcast_sizes ({3})", - resultRank, operandRank, sizesSize)); - } - - llvm::SmallVector expectedShape(sizes.getValues()); - - auto operandShape = operandType.getShape(); - expectedShape.insert(expectedShape.end(), operandShape.begin(), - operandShape.end()); - - auto resultShape = resultType.getShape(); - if (resultShape != llvm::makeArrayRef(expectedShape)) { - return emitOpError(llvm::formatv( - "result has shape [{0}" - "] instead of [{1}" - "]", - llvm::make_range(resultShape.begin(), resultShape.end()), - llvm::make_range(expectedShape.begin(), expectedShape.end()))); - } - - return success(); - }]; } def HLO_BroadcastInDimOp : HLO_Op<"broadcast_in_dim", @@ -372,72 +331,6 @@ def HLO_BroadcastInDimOp : HLO_Op<"broadcast_in_dim", let results = (outs HLO_Tensor); - // TODO(b/129012527) These should be expressed as type constraints. 
- let verifier = [{ - auto operandType = operand()->getType().cast(); - auto operandRank = operandType.getRank(); - if (!broadcast_dimensions()) { - if (operandRank == 0) { - return success(); - } - return emitOpError( - llvm::formatv("broadcast_dimensions is absent, but required because " - "operand has non-zero rank ({0})", - operandRank)); - } - - auto dimensions = broadcast_dimensions()->dyn_cast(); - if (!dimensions) { - return emitOpError(llvm::formatv( - "broadcast_sizes must be a DenseIntElementsAttr; got {0}", - broadcast_dimensions())); - } - - auto dimensionsType = broadcast_dimensions()->getType().cast(); - auto dimensionsRank = dimensionsType.getRank(); - if (dimensionsRank != 1) { - return emitOpError( - llvm::formatv("broadcast_dimensions has rank {0} instead of rank 1", - dimensionsRank)); - } - - auto dimensionsSize = dimensionsType.getNumElements(); - if (dimensionsSize != operandRank) { - return emitOpError(llvm::formatv( - "broadcast_dimensions size ({0}) does not match operand rank ({1})", - dimensionsSize, operandRank)); - } - - auto resultType = getResult()->getType().cast(); - auto resultRank = resultType.getRank(); - if (resultRank < operandRank) { - return emitOpError( - llvm::formatv("result rank ({0}) is less than operand rank ({1})", - resultRank, operandRank)); - } - - for (int i = 0; i != dimensionsSize; ++i) { - auto dimIndex = dimensions.getValue(i); - if (dimIndex >= resultRank) { - return emitOpError( - llvm::formatv("broadcast_dimensions contains invalid value {0} for " - "result result with rank {1}", - dimIndex, resultRank)); - } - - auto dimSize = operandType.getDimSize(i); - auto resultDimSize = resultType.getDimSize(dimIndex); - if (dimSize != 1 && dimSize != resultDimSize) { - return emitOpError( - llvm::formatv("size of operand dimension {0} ({1}) is not equal to " - "1 or size of result dimension {2} ({3})", - i, dimSize, dimIndex, resultDimSize)); - } - } - - return success(); - }]; - // TODO(b/130357376): One of the arguments comes from the new shape, which is // not handled by the codegen. let hasCustomHLOConverter = 1; @@ -452,74 +345,19 @@ def HLO_ClampOp : HLO_Op<"clamp", ); let results = (outs HLO_Tensor); - - // TODO(b/129012527) These should be expressed as type constraints. 
- let verifier = [{ - auto operandType = operand()->getType().cast(); - auto operandShape = operandType.getShape(); - auto minType = min()->getType().cast(); - - auto minShape = minType.getShape(); - if (minShape != operandShape && minType.getRank() != 0) { - return emitOpError(llvm::formatv( - "min shape [{0}" - "] is not scalar and does not match operand shape [{1}" - "]", - llvm::make_range(minShape.begin(), minShape.end()), - llvm::make_range(operandShape.begin(), operandShape.end()))); - } - - auto maxType = max()->getType().cast(); - auto maxShape = maxType.getShape(); - if (maxShape != operandShape && maxType.getRank() != 0) { - return emitOpError(llvm::formatv( - "max shape [{0}" - "] is not scalar and does not match operand shape [{1}" - "]", - llvm::make_range(maxShape.begin(), maxShape.end()), - llvm::make_range(operandShape.begin(), operandShape.end()))); - } - - return success(); - }]; } def HLO_ConcatenateOp : HLO_Op<"concatenate", - [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ConcatenateOp { + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ConcatenateOp { - let arguments = ( - ins Variadic:$val, - I64Attr: $dimension - ); + let arguments = (ins + Variadic:$val, + I64Attr: $dimension + ); - let verifier = [{ - auto firstType = getOperand(0)->getType().cast(); + let results = (outs HLO_Tensor); - auto firstShape = firstType.getShape(); - int numOperands = getNumOperands(); - for (int i = 1; i < numOperands; i++) { - auto secondType = getOperand(i)->getType().cast(); - - if (firstType.getRank() != secondType.getRank()) { - return emitOpError( - llvm::formatv("operands (0) and ({0}) do not match rank.", i)); - } - - auto secondShape = secondType.getShape(); - for (int d = 0; d < firstType.getRank(); ++d) { - if (firstShape[d] != secondShape[d] && d != dimension()) { - return emitOpError(llvm::formatv( - "operands (0) and ({0}) non-concat dimensions do not match " - "({1}) != ({2}).", - i, llvm::make_range(firstShape.begin(), firstShape.end()), - llvm::make_range(secondShape.begin(), secondShape.end()))); - } - } - } - return success(); - }]; - - let results = (outs HLO_Tensor); + let hasFolder = 1; // TODO(b/129422361) ConcatOp has special conversion logic to HLO. 
let hasCustomHLOConverter = 1; @@ -555,22 +393,41 @@ def HLO_CopyOp: HLO_Op<"copy", [NoSideEffect, SameOperandsAndResultType]> { def HLO_DotOp: HLO_Op<"dot", [NoSideEffect]>, BASE_HLO_DotOp { let arguments = ( - ins HLO_Tensor:$lhs, - HLO_Tensor:$rhs, - HLO_PrecisionConfigAttr:$precision_config - ); + ins HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + HLO_PrecisionConfigAttr:$precision_config + ); + let results = (outs HLO_Tensor); +} + +def DotDimensionNumbers : StructAttr<"DotDimensionNumbers", HLO_Dialect, [ + StructFieldAttr<"lhs_batching_dimensions", ElementsAttr>, + StructFieldAttr<"rhs_batching_dimensions", ElementsAttr>, + StructFieldAttr<"lhs_contracting_dimensions", ElementsAttr>, + StructFieldAttr<"rhs_contracting_dimensions", ElementsAttr>] > { + let description = "Structure of dimension information for dot product"; +} + +def HLO_DotGeneralOp: HLO_Op<"dot_general", [NoSideEffect]>, BASE_HLO_DotGeneralOp { + let arguments = (ins + HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + DotDimensionNumbers:$dot_dimension_numbers, + HLO_PrecisionConfigAttr:$precision_config + ); + let results = (outs HLO_Tensor); } def HLO_GatherOp: HLO_Op<"gather", [NoSideEffect]>, BASE_HLO_GatherOp { - let arguments = ( - ins HLO_Tensor:$operand, - HLO_IntTensor:$start_indices, - I64Attr: $index_vector_dim, - ElementsAttr: $offset_dims, - ElementsAttr: $slice_sizes, - ElementsAttr: $collapsed_slice_dims, - ElementsAttr: $start_index_map + let arguments = (ins + HLO_Tensor:$operand, + HLO_IntTensor:$start_indices, + I64Attr:$index_vector_dim, + I64ElementsAttr:$offset_dims, + I64ElementsAttr:$slice_sizes, + I64ElementsAttr:$collapsed_slice_dims, + I64ElementsAttr:$start_index_map ); let results = (outs HLO_Tensor); @@ -602,41 +459,13 @@ def HLO_SelectOp: HLO_Op<"select", [NoSideEffect]>, BASE_HLO_SelectOp { ); let results = (outs HLO_Tensor); - - // TODO(b/129012527) These should be expressed as type constraints. - let verifier = [{ - auto onTrueType = on_true()->getType().cast(); - auto onFalseType = on_false()->getType().cast(); - - if (onTrueType != onFalseType) { - return emitOpError( - llvm::formatv("on_true type ({0}) does not match on_false type ({1})", - onTrueType, onFalseType)); - } - - auto predType = pred()->getType().cast(); - auto predShape = predType.getShape(); - auto predRank = predType.getRank(); - auto selectShape = onTrueType.getShape(); - - if (predRank != 0 && predShape != selectShape) { - return emitOpError(llvm::formatv( - "pred shape ([{0}" - "]) is not scalar and does not match operand shapes ([{1}" - "])", - llvm::make_range(predShape.begin(), predShape.end()), - llvm::make_range(selectShape.begin(), selectShape.end()))); - } - - return success(); - }]; } def HLO_ReverseOp: HLO_Op<"reverse", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ReverseOp { let arguments = (ins HLO_Tensor:$operand, - ElementsAttr:$dimensions + I64ElementsAttr:$dimensions ); let results = (outs HLO_Tensor); @@ -650,9 +479,9 @@ def HLO_PadOp: HLO_Op<"pad", let arguments = (ins HLO_Tensor:$operand, HLO_Tensor:$padding_value, - ElementsAttr: $edge_padding_low, - ElementsAttr: $edge_padding_high, - ElementsAttr: $interior_padding + I64ElementsAttr: $edge_padding_low, + I64ElementsAttr: $edge_padding_high, + I64ElementsAttr: $interior_padding ); let results = (outs HLO_Tensor); @@ -661,51 +490,6 @@ def HLO_PadOp: HLO_Op<"pad", Pads the `operand` according to TBD. 
}]; - let verifier = [{ - auto input_type = operand()->getType().cast(); - auto pad_type = padding_value()->getType().cast(); - - if (pad_type.getRank() != 0) { - return emitOpError(llvm::formatv("padding value type should be a rank-0 " - "tensor, is rank {0}", pad_type.getRank())); - } - - const auto& padding_low = edge_padding_low(); - if (padding_low.getType().getNumElements() != input_type.getRank()) { - return emitOpError(llvm::formatv( - "edge_padding_low length ({0}) must match operand rank ({1}).", - padding_low.getType().getNumElements(), input_type.getRank())); - } - - const auto& padding_high = edge_padding_high(); - if (padding_high.getType().getNumElements() != input_type.getRank()) { - return emitOpError(llvm::formatv( - "edge_padding_high length ({0}) must match operand rank ({1}).", - padding_high.getType().getNumElements(), input_type.getRank())); - } - - auto input_shape = input_type.getShape(); - auto output_shape = getResult()->getType().cast().getShape(); - if (input_shape.size() != output_shape.size()) { - return emitOpError(llvm::formatv( - "Operand rank ({0}) and result rank({0}) should match", - input_shape.size(), output_shape.size())); - } - - for (int i = 0, e = input_shape.size(); i < e; i++) { - int expected_output = input_shape[i] - + padding_low.getValue(i).getInt() - + padding_high.getValue(i).getInt(); - if (expected_output != output_shape[i]) { - return emitOpError(llvm::formatv("Expected output shape ({0}) and " - "output shape ({1}) should match.", - expected_output, output_shape[i])); - } - } - - return success(); - }]; - // TODO(b/129422361): PadOp has a custom constructor for HLO. let hasCustomHLOConverter = 1; } @@ -714,63 +498,45 @@ def HLO_TransposeOp: HLO_Op<"transpose", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_TransposeOp { let arguments = (ins HLO_Tensor:$operand, - ElementsAttr:$permutation + I64ElementsAttr:$permutation ); let results = (outs HLO_Tensor); let hasFolder = 1; +} - // TODO(b/129012527) These should be expressed as type constraints. - let verifier = [{ - if (!permutation().isa()) { - return emitOpError( - llvm::formatv("permutation must be a DenseIntElementsAttr; got {0}", - permutation())); - } +def HLO_ReduceWindowOp: HLO_Op<"reduce_window", [ + NoSideEffect, + SameVariadicOperandSize, + SingleBlockImplicitTerminator<"ReturnOp"> + ]>, BASE_HLO_ReduceWindowOp { - auto permutationType = permutation().getType().cast(); - auto permutationRank = permutationType.getRank(); - if (permutationRank != 1) { - return emitOpError(llvm::formatv( - "permutation has rank {0} instead of rank 1", permutationRank)); - } + // TODO(hinsu): Verify that padding attribute is 2-d and the remaining + // attributes are 1-d. Attributes' leading dimension should match rank of the + // inputs. + let arguments = (ins + Variadic:$operands, + Variadic:$init_values, + I64ElementsAttr:$window_dimensions, + // If strides or dilations attributes are missing then the default value is + // one for each of the input dimensions. Similarly, padding values are zero + // for both low and high in each of the dimensions, if not specified. 
+ OptionalAttr:$window_strides, + OptionalAttr:$base_dilations, + OptionalAttr:$window_dilations, + OptionalAttr:$padding + ); - auto operandType = operand()->getType().cast(); - auto operandRank = operandType.getRank(); - auto permutationSize = permutationType.getNumElements(); - if (permutationSize != operandRank) { - return emitOpError(llvm::formatv( - "permutation size ({0}) does not match operand rank ({1})", - permutationSize, operandRank)); - } + let results = (outs Variadic); - auto resultType = getResult()->getType().cast(); - auto resultRank = resultType.getRank(); - if (resultRank != operandRank) { - return emitOpError( - llvm::formatv("result rank ({0}) does not match operand rank ({1})", - resultRank, operandRank)); - } + // TODO(hinsu): Verify that the attached body arguments and results are + // compatible with reduce op's operands. + let regions = (region SizedRegion<1>:$body); - auto resultShape = resultType.getShape(); + // TODO(b/129422361): ReduceWindowOp has special conversion logic to HLO. + let hasCustomHLOConverter = 1; - auto expectedShape = SmallVector(operandRank); - for (int i = 0; i != operandRank; ++i) { - auto permutedDim = permutation().getValue(i).getInt(); - expectedShape[i] = operandType.getDimSize(permutedDim); - } - - if (resultShape != llvm::makeArrayRef(expectedShape)) { - return emitOpError(llvm::formatv( - "result shape is [{0}" - "] instead of [{1}" - "]", - llvm::make_range(resultShape.begin(), resultShape.end()), - llvm::make_range(expectedShape.begin(), expectedShape.end()))); - } - - return success(); - }]; + // TODO(hinsu): Implement custom printer and parser. } def HLO_ReturnOp : HLO_Op<"return", [Terminator]> { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index 28d6efd0aad..6623c21dcb8 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -165,7 +165,7 @@ class BASE_HLO_TanhOp { // smaller rank shape is broadcast into a larger rank shape. For example, // given a 2x3x4 cuboid and a 3x4 matrix, a broadcasting tuple (1,2) means // matching the matrix to dimensions 1 and 2 of the cuboid. -def BroadcastDimAttr : OptionalAttr; +def BroadcastDimAttr : OptionalAttr; class BASE_HLO_AddOp { string summary = "Addition operator"; @@ -244,10 +244,6 @@ class BASE_HLO_AndOp { }]; } -//===----------------------------------------------------------------------===// -// XLA control flow op definitions. -//===----------------------------------------------------------------------===// - class BASE_HLO_ReduceOp { string summary = "Reduce operator"; @@ -259,6 +255,17 @@ class BASE_HLO_ReduceOp { }]; } +class BASE_HLO_ReduceWindowOp { + string summary = "ReduceWindow operator"; + + string description = [{ + Returns the result of executing a reduction function over all elements in + each window of one or more arrays in parallel. + + See https://www.tensorflow.org/xla/operation_semantics#reducewindow. + }]; +} + //===----------------------------------------------------------------------===// // XLA tuple op definitions. //===----------------------------------------------------------------------===// @@ -454,6 +461,17 @@ class BASE_HLO_DotOp { }]; } +class BASE_HLO_DotGeneralOp { + string summary = "General Dot operator"; + string description = [{ + Performs general dot products between vectors, vector/matrix and + matrix/matrix multiplication. + + See https://www.tensorflow.org/xla/operation_semantics#dotgeneral. 
+ }]; +} + + class BASE_HLO_GatherOp{ string summary = "Gather operator"; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 4ef40c4c69f..597e5b3671b 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -133,12 +133,13 @@ def LHLO_AndOp: LHLO_BinaryElementwiseOp<"and", []>, BASE_HLO_AndOp; // TODO(b/139813999): specify required function signature in a type-safe way. def LHLO_ReduceOp: LHLO_Op<"reduce", [SameVariadicOperandSize]>, BASE_HLO_ReduceOp { let arguments = (ins - Variadic:$operands_and_init, + Variadic:$operands, + Variadic:$init_values, Variadic:$out, // TODO(hinsu): Attach computation as a region similar to the // xla_hlo.reduce op. SymbolRefAttr:$computation, - ElementsAttr:$dimensions + I64ElementsAttr:$dimensions ); } //===----------------------------------------------------------------------===// @@ -175,12 +176,13 @@ def LHLO_CompareOp: LHLO_Op<"compare", []>, BASE_HLO_CompareOp { def LHLO_SliceOp: LHLO_Op< "slice", - [AllTypesMatch<["start_indices", "limit_indices"]>]> { + [AllTypesMatch<["start_indices", "limit_indices", "strides"]>]> { let arguments = (ins LHLO_Buffer:$operand, LHLO_Buffer:$output, - ElementsAttr:$start_indices, - ElementsAttr:$limit_indices + I64ElementsAttr:$start_indices, + I64ElementsAttr:$limit_indices, + I64ElementsAttr:$strides ); } @@ -217,7 +219,7 @@ def LHLO_BroadcastOp : LHLO_Op<"broadcast", let arguments = (ins LHLO_Buffer:$operand, LHLO_Buffer:$output, - ElementsAttr:$broadcast_sizes + I64ElementsAttr:$broadcast_sizes ); } @@ -243,7 +245,7 @@ def LHLO_ConcatenateOp : LHLO_Op<"concatenate", []>, BASE_HLO_ConcatenateOp { let arguments = (ins Variadic:$val, LHLO_Buffer:$output, - I64Attr: $dimension + I64Attr:$dimension ); } @@ -268,11 +270,11 @@ def LHLO_GatherOp: LHLO_Op<"gather", []>, BASE_HLO_GatherOp { let arguments = (ins LHLO_Buffer:$operand, LHLO_IntBuffer:$start_indices, - I64Attr: $index_vector_dim, - ElementsAttr: $offset_dims, - ElementsAttr: $slice_sizes, - ElementsAttr: $collapsed_slice_dims, - ElementsAttr: $start_index_map, + I64Attr:$index_vector_dim, + I64ElementsAttr:$offset_dims, + I64ElementsAttr:$slice_sizes, + I64ElementsAttr:$collapsed_slice_dims, + I64ElementsAttr:$start_index_map, LHLO_Buffer:$output ); } @@ -297,7 +299,7 @@ def LHLO_SelectOp: LHLO_Op<"select", []>, BASE_HLO_SelectOp { def LHLO_ReverseOp: LHLO_Op<"reverse", []>, BASE_HLO_ReverseOp { let arguments = (ins LHLO_Buffer:$operand, - ElementsAttr:$dimensions, + I64ElementsAttr:$dimensions, LHLO_Buffer:$output ); } @@ -306,17 +308,17 @@ def LHLO_PadOp: LHLO_Op<"pad", []>, BASE_HLO_PadOp { let arguments = (ins LHLO_Buffer:$operand, LHLO_Buffer:$padding_value, - ElementsAttr: $edge_padding_low, - ElementsAttr: $edge_padding_high, - ElementsAttr: $interior_padding, - LHLO_Buffer: $output + I64ElementsAttr:$edge_padding_low, + I64ElementsAttr:$edge_padding_high, + I64ElementsAttr:$interior_padding, + LHLO_Buffer:$output ); } def LHLO_TransposeOp: LHLO_Op<"transpose", []>, BASE_HLO_TransposeOp { let arguments = (ins LHLO_Buffer:$operand, - ElementsAttr:$permutation, + I64ElementsAttr:$permutation, LHLO_Buffer:$output ); } diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 230044d538b..c4008133b0c 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -27,42 +27,50 @@ limitations under the License. 
#include "mlir/IR/Function.h" // TF:local_config_mlir #include "mlir/IR/Location.h" // TF:local_config_mlir #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Matchers.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/comparison_util.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -using tensorflow::int64; +using ::tensorflow::int16; +using ::tensorflow::int32; +using ::tensorflow::int64; +using ::tensorflow::int8; +using ::tensorflow::uint16; +using ::tensorflow::uint32; +using ::tensorflow::uint64; +using ::tensorflow::uint8; static std::vector ConvertDenseIntAttr(mlir::DenseIntElementsAttr attr) { auto values = attr.getValues(); return {values.begin(), values.end()}; } -// Converts the broadcast_dimensions attribute into a span of dimension numbers -// (empty if the attribute is absent). +// Converts the broadcast_dimensions attribute into a vector of dimension +// numbers (empty if the attribute is absent). static std::vector Convert_broadcast_dimensions( - llvm::Optional broadcast_dimensions) { + llvm::Optional broadcast_dimensions) { if (!broadcast_dimensions.hasValue()) return {}; - return ConvertDenseIntAttr( - broadcast_dimensions->cast()); + return ConvertDenseIntAttr(*broadcast_dimensions); } -// Converts the broadcast_sizes attribute into a span of dimension sizes. +// Converts the broadcast_sizes attribute into a vector of dimension sizes. 
static std::vector Convert_broadcast_sizes( - mlir::ElementsAttr broadcast_sizes) { - return ConvertDenseIntAttr( - broadcast_sizes.cast()); + mlir::DenseIntElementsAttr broadcast_sizes) { + return ConvertDenseIntAttr(broadcast_sizes); } -static std::vector Convert_permutation(mlir::ElementsAttr permutation) { - return ConvertDenseIntAttr(permutation.cast()); +static std::vector Convert_permutation( + mlir::DenseIntElementsAttr permutation) { + return ConvertDenseIntAttr(permutation); } // Converts the precision config array of strings attribute into the @@ -90,6 +98,45 @@ static std::unique_ptr Convert_precision_config( return precision_config; } +static xla::DotDimensionNumbers Convert_dot_dimension_numbers( + mlir::xla_hlo::DotDimensionNumbers dot_dimension_numbers_attr) { + xla::DotDimensionNumbers dot_dimension_numbers; + + auto rhs_contracting_dimensions = + dot_dimension_numbers_attr.rhs_contracting_dimensions() + .cast(); + auto lhs_contracting_dimensions = + dot_dimension_numbers_attr.lhs_contracting_dimensions() + .cast(); + auto rhs_batch_dimensions = + dot_dimension_numbers_attr.rhs_batching_dimensions() + .cast(); + auto lhs_batch_dimensions = + dot_dimension_numbers_attr.lhs_batching_dimensions() + .cast(); + + for (auto val : rhs_contracting_dimensions) { + dot_dimension_numbers.add_rhs_contracting_dimensions( + val.getLimitedValue(UINT64_MAX)); + } + for (auto val : lhs_contracting_dimensions) { + dot_dimension_numbers.add_lhs_contracting_dimensions( + val.getLimitedValue(UINT64_MAX)); + } + + for (auto val : rhs_batch_dimensions) { + dot_dimension_numbers.add_rhs_batch_dimensions( + val.getLimitedValue(UINT64_MAX)); + } + + for (auto val : lhs_batch_dimensions) { + dot_dimension_numbers.add_lhs_batch_dimensions( + val.getLimitedValue(UINT64_MAX)); + } + + return dot_dimension_numbers; +} + // Converts the comparison_direction string attribute into the XLA enum. The // string is assumed to correspond to exactly one of the allowed strings // representing the enum. This should have been checked in the op verify method. @@ -132,13 +179,47 @@ static double ConvertAPFloat(llvm::APFloat value) { namespace mlir { namespace { +StatusOr CreateLiteralFromAttr(Type type, ElementsAttr attr) { + xla::Shape shape = xla::TypeToShape(type); + +#define ELEMENTS_ATTR_TO_LITERAL(xla_type, cpp_type) \ + case xla_type: { \ + xla::Array source_data(shape.dimensions()); \ + source_data.SetValues(attr.getValues()); \ + return xla::LiteralUtil::CreateFromArray(source_data); \ + } + + switch (shape.element_type()) { + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::PRED, bool) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::F32, float) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::F64, double) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::S8, int8) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::S16, int16) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::S32, int32) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::S64, int64) + // TODO(b/130356985): Update once MLIR supports unsigned integers. 
+ ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U8, uint8) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U16, uint16) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U32, uint32) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U64, uint64) + default: + return tensorflow::errors::Internal(absl::StrCat( + "Unsupported type: ", xla::PrimitiveType_Name(shape.element_type()))); + } +#undef ELEMENTS_ATTR_TO_LITERAL +} + class ConvertToHloModule { public: using ValueLoweringMap = llvm::DenseMap; using FunctionLoweringMap = llvm::DenseMap; - explicit ConvertToHloModule(mlir::ModuleOp module) - : module_(module), module_builder_("main") {} + explicit ConvertToHloModule(mlir::ModuleOp module, bool use_tuple_args, + bool always_return_tuple) + : module_(module), + module_builder_("main"), + use_tuple_args_(use_tuple_args), + always_return_tuple_(always_return_tuple) {} // Perform the lowering to XLA. This function returns failure if an error was // encountered. @@ -160,6 +241,9 @@ class ConvertToHloModule { } private: + LogicalResult Lower(mlir::Operation* inst, xla::XlaBuilder* builder, + ConvertToHloModule::ValueLoweringMap* value_lowering); + // The module being lowered. mlir::ModuleOp module_; @@ -168,26 +252,37 @@ class ConvertToHloModule { // Map between function and lowered computation. FunctionLoweringMap lowered_computation_; + + // Whether the entry function should take a single tuple as input. + bool use_tuple_args_; + + // Whether to always return a tuple. + bool always_return_tuple_; }; -LogicalResult Lower(mlir::Operation* inst, xla::XlaBuilder* builder, - ConvertToHloModule::FunctionLoweringMap* function_lowering, - ConvertToHloModule::ValueLoweringMap* value_lowering) { - if (auto xla_op = CreateXlaOperator(inst, value_lowering)) return success(); - - // TODO(riverriddle) We currently don't support lowering constant operations. - if (isa(inst)) { - inst->emitError("unable to lower 'xla_hlo.constant' operation"); - return failure(); - } +LogicalResult ConvertToHloModule::Lower( + mlir::Operation* inst, xla::XlaBuilder* builder, + ConvertToHloModule::ValueLoweringMap* value_lowering) { + if (succeeded(ExportXlaOperator(inst, value_lowering))) return success(); auto& value_map = *value_lowering; + ElementsAttr const_attr; + // TODO(jpienaar): This doesn't support layouts yet. + if (matchPattern(inst, m_Constant(&const_attr))) { + auto literal_or = + CreateLiteralFromAttr(*inst->result_type_begin(), const_attr); + if (!literal_or.ok()) return inst->emitError("unsupported elemental type"); + value_map[inst->getResult(0)] = + xla::ConstantLiteral(builder, literal_or.ValueOrDie()); + return success(); + } + if (auto ret = dyn_cast(inst)) { // Construct the return value for the function. If there are multiple // values returned, then create a tuple, else return value directly. 
xla::XlaOp return_value; unsigned num_return_values = ret.getNumOperands(); - if (num_return_values > 1) { + if (always_return_tuple_ || num_return_values > 1) { std::vector returns(num_return_values); for (unsigned i = 0, e = ret.getNumOperands(); i != e; ++i) { returns[i] = value_map[ret.getOperand(i)]; @@ -205,7 +300,7 @@ LogicalResult Lower(mlir::Operation* inst, xla::XlaBuilder* builder, return failure(); } auto f = inst->getParentOfType(); - (*function_lowering)[f] = std::move(computation_or.ValueOrDie()); + lowered_computation_[f] = std::move(computation_or.ValueOrDie()); return success(); } inst->emitError("unable to lower operation of type '" + @@ -228,28 +323,42 @@ LogicalResult ConvertToHloModule::RunOnFunction(mlir::FuncOp f) { // Mapping from the Value to lowered XlaOp. The code below lowers in // program order and will fail if an operand is unseen. This can be improved. ValueLoweringMap lowering; - for (auto& bb : f) { - int num = 0; - for (auto& arg : bb.getArguments()) { + auto& bb = f.front(); + + // If using tuples as input, then there is only one input + // parameter that is a tuple. + if (use_tuple_args_) { + std::vector arg_shapes; + arg_shapes.reserve(bb.getNumArguments()); + for (auto& arg : bb.getArguments()) + arg_shapes.push_back(xla::TypeToShape(arg->getType())); + xla::Shape input_shape = xla::ShapeUtil::MakeTupleShape(arg_shapes); + auto tuple = xla::Parameter(&builder, 0, input_shape, "arg_tuple"); + for (auto& it : llvm::enumerate(bb.getArguments())) { + lowering[it.value()] = xla::GetTupleElement(tuple, it.index()); + } + } else { + for (auto& it : llvm::enumerate(bb.getArguments())) { + auto* arg = it.value(); + auto num = it.index(); xla::Shape shape = xla::TypeToShape(arg->getType()); lowering[arg] = xla::Parameter(&builder, num, shape, absl::StrCat("Arg_", num)); - ++num; } - - for (auto& inst : bb) - if (failed(Lower(&inst, &builder, &lowered_computation_, &lowering))) - return failure(); } + for (auto& inst : bb) + if (failed(Lower(&inst, &builder, &lowering))) return failure(); + return success(); } } // namespace -Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto) { +Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto, + bool use_tuple_args, bool always_return_tuple) { mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); - ConvertToHloModule converter(module); + ConvertToHloModule converter(module, use_tuple_args, always_return_tuple); if (failed(converter.Run())) return diag_handler.ConsumeStatus(); auto hlo_module = converter.ConsumeMainProto(); hlo_proto->mutable_hlo_module()->Swap(&hlo_module); diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h index b16636f039c..24d20fe7017 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h @@ -23,8 +23,12 @@ limitations under the License. namespace mlir { -// Converts a MLIR module in HLO dialect into a HloModuleProto. -Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto); +// Converts a MLIR module in HLO dialect into a HloModuleProto. If +// use_tuple_args is set, then functions will have a single tuple as input. If +// always_return_tuple is set, then functions will return tuple whether or not +// there is only one result. 
+Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto, + bool use_tuple_args, bool always_return_tuple); // Creates XlaOp equivalent of a given MLIR operation using the operand info // from `value_lowering` map. diff --git a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc index 6aecf70b385..00b7cd06a1e 100644 --- a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc +++ b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" @@ -24,24 +25,19 @@ limitations under the License. #include "llvm/TableGen/Main.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" +#include "mlir/Support/STLExtras.h" // TF:local_config_mlir #include "mlir/TableGen/Operator.h" // TF:local_config_mlir -using llvm::dyn_cast; -using llvm::LessRecord; using llvm::raw_ostream; -using llvm::Record; using llvm::RecordKeeper; using llvm::StringRef; +using mlir::interleaveComma; +using mlir::tblgen::NamedAttribute; +using mlir::tblgen::NamedTypeConstraint; using mlir::tblgen::Operator; -// Returns the builder function name for the given op definition. -// E.g., AddOp -> CreateAddOp -static inline std::string GetOperatorBuilderName(StringRef op_name) { - return "Create" + op_name.str(); -} - -static std::string GetConversionFunction( - mlir::tblgen::NamedAttribute named_attr) { +static std::string GetDefaultAttrExport( + const mlir::tblgen::NamedAttribute& named_attr) { auto storage_type = named_attr.attr.getStorageType(); // For some attribute types we have a general conversion, so use that. if (storage_type.endswith("IntegerAttr") || @@ -51,110 +47,50 @@ static std::string GetConversionFunction( return "Convert_" + named_attr.name.str(); } -using ArgumentName = std::string; -using ArgumentDeclaration = std::string; -using Argument = std::pair; -using ArgumentList = std::vector; - -static std::string BuildOperator(const Operator& op) { - std::stringstream os; - StringRef op_name = op.getCppClassName(); - std::string xla_op_name = op_name.drop_back(2).str(); - - // Signature. - os << "static xla::XlaOp " << GetOperatorBuilderName(op_name) - << "(mlir::xla_hlo::" << op_name.str() << " xla_op, " - << "llvm::DenseMap* " - "value_lowering) {\n"; - +static void BuildOperator(const Operator& op, raw_ostream* output) { + auto& os = *output; os << " auto& value_map = *value_lowering;\n" << " auto result = xla_op.getResult();\n"; - // Invoke the conversion function for each attribute. - for (const auto& named_attr : op.getAttributes()) { - os << " auto " << named_attr.name.str() << " = " - << GetConversionFunction(named_attr) << "(" - << "xla_op." << named_attr.name.str() << "());\n"; + // Build a conversion for each of the arguments. + int operand_number = 0; + for (int index : llvm::seq(0, op.getNumArgs())) { + auto arg = op.getArg(index); + + // Emit an argument for an operand. + if (auto* operand_cst = arg.dyn_cast()) { + // Handle a non-variadic operand. + if (!operand_cst->isVariadic()) { + os << "auto xla_arg_" << index << " = value_map[*xla_op.getODSOperands(" + << operand_number++ << ").begin()];\n"; + continue; + } + + // Otherwise, this is a varidiac operand list. 
+ os << " std::vector xla_arg_" << index << ";" + << " for (auto operand : xla_op.getODSOperands(" << operand_number++ + << "))\n xla_arg_" << index << ".push_back(value_map[operand]);\n"; + continue; + } + + // Otherwise, this is an attribute. + auto named_attr = arg.get(); + os << "auto xla_arg_" << index << " = " << GetDefaultAttrExport(*named_attr) + << "(xla_op." << op.getArgName(index) << "());\n"; } // Assumes that the client builder method names closely follow the op names // in the dialect. For e.g., AddOp -> xla::Add method. - os << " auto xla_result = xla::" << xla_op_name << "("; - - int num_operands = op.getNumOperands(); - if (num_operands == 1) { - os << "value_map[xla_op.getOperand()]"; - } else { - for (auto i = 0; i < num_operands; i++) { - os << "value_map[xla_op.getOperand(" << i << ")]"; - if (i != num_operands - 1) { - os << ", "; - } - } - } - - for (const auto& named_attr : op.getAttributes()) { - os << ", Unwrap(" << named_attr.name.str() << ")"; - } + StringRef op_name = op.getCppClassName(); + os << " auto xla_result = xla::" << op_name.drop_back(2) << "("; + // Emit each of the arguments. + interleaveComma(llvm::seq(0, op.getNumArgs()), os, + [&](int i) { os << "Unwrap(xla_arg_" << i << ')'; }); os << ");\n"; os << " value_map[result] = xla_result;\n"; - os << " return xla_result;\n"; - os << "}\n\n"; - return os.str(); -} - -// For each XLA op, emits a builder function that constructs the XLA op using -// the HLO client builder. -static void EmitOperatorBuilders(const RecordKeeper& record_keeper, - const std::vector& defs, - raw_ostream* ostream) { - raw_ostream& os = *ostream; - - for (const auto* def : defs) { - // Skip operations that have a custom converter. - if (def->getValueAsBit("hasCustomHLOConverter")) continue; - - Operator op(def); - os << BuildOperator(op); - } -} - -// Emits a builder function that returns the XlaOp object given a -// mlir::Operation. -// -// The signature of the function is: -// -// llvm::Optional -// mlir::CreateXlaOperator( -// mlir::Operation* op, -// llvm::DenseMap -// *value_lowering); -static void EmitBuilder(const std::vector& defs, - raw_ostream* ostream) { - raw_ostream& os = *ostream; - - // Signature - os << "llvm::Optional\n" - "mlir::CreateXlaOperator(mlir::Operation* op, " - "llvm::DenseMap " - "*value_lowering) {\n"; - - for (const auto* def : defs) { - // Skip operations that have a custom converter. - if (def->getValueAsBit("hasCustomHLOConverter")) continue; - - StringRef op_name = def->getName().drop_front(4); - - // Try to cast to each op and call the corresponding op builder. - os << " if (auto xla_op = llvm::dyn_cast(op))\n return " << GetOperatorBuilderName(op_name) - << "(xla_op, value_lowering);\n"; - } - - os << " return llvm::None;\n" - "}\n"; + os << " return mlir::success();\n"; } // The function below has a non-constant reference as that is required by LLVM's @@ -163,26 +99,27 @@ static void EmitBuilder(const std::vector& defs, static bool OperatorWritersMain(raw_ostream& os, RecordKeeper& records) { emitSourceFileHeader("MLIR XLA Builders", os); - // Retrieve all the definitions derived from HLO_Op and sort by record name. - std::vector defs = records.getAllDerivedDefinitions("HLO_Op"); - llvm::sort(defs, LessRecord()); + // Emit a function to generate an XLA operation for the operations with + // auto-generated builders. 
+ os << "mlir::LogicalResult ExportXlaOperator(\n" + "mlir::Operation* op, llvm::DenseMap " + "*value_lowering) {\n"; - for (const auto* def : defs) { - // XLA ops in the .td file are expected to follow the naming convention: - // HLO_Op. - // The generated XLA op C++ class should be HLO::Op. - if (!def->getName().startswith("HLO_")) - PrintFatalError(def->getLoc(), - "unexpected op name format: 'HLO_' prefix missing"); - if (!def->getName().endswith("Op")) - PrintFatalError(def->getLoc(), - "unexpected op name format: 'Op' suffix missing"); + // Retrieve all the definitions derived from HLO_Op and sort by record name. + for (const auto* def : records.getAllDerivedDefinitions("HLO_Op")) { + // Skip operations that have a custom exporter. + if (def->getValueAsBit("hasCustomHLOConverter")) continue; + Operator op(def); + + // Cast to the current operation and build the exporter. + os << " if (auto xla_op = llvm::dyn_cast(op)) {\n"; + BuildOperator(op, &os); + os << "}\n"; } - EmitOperatorBuilders(records, defs, &os); - os << "\n\n"; - EmitBuilder(defs, &os); - + os << " return mlir::failure();\n" + "}\n"; return false; } diff --git a/tensorflow/compiler/mlir/xla/tests/concatenate.mlir b/tensorflow/compiler/mlir/xla/tests/concatenate.mlir new file mode 100644 index 00000000000..b0f3ceeb59e --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/concatenate.mlir @@ -0,0 +1,9 @@ +// RUN: tf-opt %s -canonicalize | FileCheck %s + +// CHECK-LABEL: func @single_operand +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @single_operand(%arg: tensor<1x2xf32>) -> tensor<1x2xf32> { + %0 = "xla_hlo.concatenate"(%arg) {dimension = 0 : i64} : (tensor<1x2xf32>) -> tensor<1x2xf32> + // CHECK-NEXT: return [[ARG]] + return %0 : tensor<1x2xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 0328761becc..4af6726c584 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -141,6 +141,53 @@ func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2x return %0: tensor<1x2xi32> } +//===----------------------------------------------------------------------===// +// Concat op legalizations. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @concat_v2 +func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> { + // CHECK: "xla_hlo.concatenate"({{.*}}) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> + %axis = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) {N = 2 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> + return %1 : tensor<6x3xf32> +} + +// CHECK-LABEL: func @concat_v2_neg_axis +func @concat_v2_neg_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> { + // CHECK: "xla_hlo.concatenate"({{.*}}) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> + + %axis = "tf.Const"() { value = dense<-2> : tensor } : () -> tensor + %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) {N = 2 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> + return %1 : tensor<6x3xf32> +} + +// CHECK-LABEL: func @concat_v2_1d_axis +func @concat_v2_1d_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x6xf32> { + // CHECK: "xla_hlo.concatenate"({{.*}}) {dimension = 1 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x6xf32> + + %axis = "tf.Const"() { value = dense<[1]> : tensor<1xi64> } : () -> tensor<1xi64> + %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) {N = 2 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>, tensor<1xi64>) -> tensor<3x6xf32> + return %1 : tensor<3x6xf32> +} + +// CHECK-LABEL: func @concat_v2_non_const_axis +func @concat_v2_non_const_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>, %axis: tensor) -> tensor<3x6xf32> { + // CHECK: "tf.ConcatV2" + + %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) {N = 2 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<3x6xf32> + return %1 : tensor<3x6xf32> +} + +// CHECK-LABEL: func @concat_v2_unranked +func @concat_v2_unranked(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: "tf.ConcatV2" + + %axis = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) {N = 2 : i64} : (tensor<*xf32>, tensor<*xf32>, tensor) -> tensor<*xf32> + return %1 : tensor<*xf32> +} + //===----------------------------------------------------------------------===// // Identity op legalizations. //===----------------------------------------------------------------------===// @@ -177,6 +224,37 @@ func @matmul_notranspose(%arg0: tensor<5x7xf32>, %arg1: tensor<7x11xf32>) -> ten return %0 : tensor<5x11xf32> } +//===----------------------------------------------------------------------===// +// MaxPool op legalizations. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: maxpool_valid_padding +// CHECK-SAME: %[[ARG:.*]]: tensor +func @maxpool_valid_padding(%arg0: tensor<2x12x20x7xi32>) -> tensor<2x3x5x7xi32> { + // CHECK: %[[INIT:.*]] = constant dense<-2147483648> : tensor + // CHECK: "xla_hlo.reduce_window"(%[[ARG]], %[[INIT]]) + // CHECK: xla_hlo.max + // CHECK: xla_hlo.return + // CHECK: {window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 4, 4, 1]> : tensor<4xi64>} + + %0 = "tf.MaxPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 4, 4, 1]} : (tensor<2x12x20x7xi32>) -> tensor<2x3x5x7xi32> + return %0 : tensor<2x3x5x7xi32> +} + +//===----------------------------------------------------------------------===// +// Pack op legalizations. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @pack +func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { + // CHECK: "xla_hlo.reshape"({{.*}}) : (tensor<2xi32>) -> tensor<1x2xi32> + // CHECK: "xla_hlo.reshape"({{.*}}) : (tensor<2xi32>) -> tensor<1x2xi32> + // CHECK: "xla_hlo.concatenate"({{.*}}) {dimension = 0 : i64} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<2x2xi32> + + %0 = "tf.Pack"(%arg0, %arg1) {N = 2 : i64} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + //===----------------------------------------------------------------------===// // Relu op legalizations. //===----------------------------------------------------------------------===// @@ -198,6 +276,64 @@ func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { return %0: tensor<1xi32> } +//===----------------------------------------------------------------------===// +// Softmax op legalizations. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @simple_softmax +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x3xf32>) +func @simple_softmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + // CHECK: %[[NEG_INF:.*]] = constant dense<0xFF800000> : tensor + // CHECK: %[[ZERO:.*]] = constant dense<0.000000e+00> : tensor + + // Verify reduce op for max computation and its body. + // CHECK: %[[MAX:.*]] = "xla_hlo.reduce"(%[[ARG0]], %[[NEG_INF]]) + // CHECK: xla_hlo.max + // CHECK: "xla_hlo.return" + // CHECK: {dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf32>, tensor) -> tensor<2xf32> + + // CHECK: %[[SHIFTED_INP:.*]] = "xla_hlo.sub"(%[[ARG0]], %[[MAX]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: %[[EXP:.*]] = "xla_hlo.exp"(%[[SHIFTED_INP]]) + + // Verify reduce op for summation and its body. + // CHECK: %[[SUM:.*]] = "xla_hlo.reduce"(%[[EXP]], %[[ZERO]]) + // CHECK: xla_hlo.add + // CHECK: "xla_hlo.return" + // CHECK: {dimensions = dense<1> : tensor<1xi64>} + + // CHECK: %[[RESULT:.*]] = "xla_hlo.div"(%[[EXP]], %[[SUM]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // return %[[RESULT]] + + %0 = "tf.Softmax"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %0: tensor<2x3xf32> +} + +// CHECK-LABEL: bf16_softmax +func @bf16_softmax(%arg0: tensor<2x3xbf16>) -> tensor<2x3xbf16> { + // Verify that conversion to f32 and then back to bf16 are introduced. + + // CHECK: "xla_hlo.convert"({{.*}}) : (tensor<2x3xbf16>) -> tensor<2x3xf32> + // CHECK: "xla_hlo.convert"({{.*}}) : (tensor<2xf32>) -> tensor<2xbf16> + + %0 = "tf.Softmax"(%arg0) : (tensor<2x3xbf16>) -> tensor<2x3xbf16> + return %0: tensor<2x3xbf16> +} + +// CHECK-LABEL: rank4_softmax +func @rank4_softmax(%arg0: tensor<2x3x4x5xf16>) -> tensor<2x3x4x5xf16> { + // Verify that reduce op dimensions and broadcast dimensions are correct. + + // CHECK: "xla_hlo.reduce" + // CHECK: dimensions = dense<3> + + // CHECK: "xla_hlo.reduce" + // CHECK: dimensions = dense<3> + + // CHECK: "xla_hlo.div"{{.*}} {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} + %0 = "tf.Softmax"(%arg0) : (tensor<2x3x4x5xf16>) -> tensor<2x3x4x5xf16> + return %0: tensor<2x3x4x5xf16> +} + //===----------------------------------------------------------------------===// // Unary op legalizations. 
//===----------------------------------------------------------------------===// @@ -229,3 +365,10 @@ func @squeeze_dynamic(%arg0: tensor) -> tensor<*xf32> { %0 = "tf.Squeeze"(%arg0) : (tensor) -> tensor<*xf32> return %0 : tensor<*xf32> } + +// CHECK-LABEL: expand_dims +func @expand_dims(%arg0: tensor<2xf32>, %axis: tensor) -> tensor<1x2xf32> { + // CHECK: "xla_hlo.reshape"{{.*}} : (tensor<2xf32>) -> tensor<1x2xf32> + %0 = "tf.ExpandDims"(%arg0, %axis) : (tensor<2xf32>, tensor) -> tensor<1x2xf32> + return %0 : tensor<1x2xf32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir index 4aee05a146c..146dc0a4b45 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -136,9 +136,8 @@ func @reduce_computation(%sum: memref<1xf32>, %element: memref<1xf32>) -> () { } // CHECK-LABEL: func @reduce_memref -func @reduce_memref(%input: memref<10xf32>, %out: memref<1xf32>) -> () { - "xla_lhlo.reduce"(%input, %out) {computation = @reduce_computation, - dimensions = dense<[0]> : tensor<1xi64>} : (memref<10xf32>, memref<1xf32>) -> () +func @reduce_memref(%input: memref<10xf32>, %init: memref, %out: memref<1xf32>) -> () { + "xla_lhlo.reduce"(%input, %init, %out) {computation = @reduce_computation, dimensions = dense<[0]> : tensor<1xi64>} : (memref<10xf32>, memref, memref<1xf32>) -> () return } @@ -156,4 +155,4 @@ func @fusion_memref(%input1: memref<10xf32>, %input2: memref<10xf32>, %input3: m "xla_lhlo.terminator"() : () -> () } ) : () -> () return -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir b/tensorflow/compiler/mlir/xla/tests/ops.mlir index 06c98fb39b0..4520f7615ca 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -50,30 +50,6 @@ func @broadcast(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // ----- -func @broadcast_nonint_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { - // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[1.0, 2.0]> : tensor<2xf64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> - return %0 : tensor<1x2x3xi32> -} - -// ----- - -func @broadcast_splat_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { - // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<2.0> : tensor<2xf64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> - return %0 : tensor<1x2x3xi32> -} - -// ----- - -func @broadcast_sparse_sizes(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { - // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<3xi32>) -> tensor<1x2x3xi32> - return %0 : tensor<1x2x3xi32> -} - -// ----- - func @broadcast_bad_sizes_rank(%arg0: tensor<3xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_sizes has rank 2 instead of rank 1}} %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[[1, 2]]> : tensor<1x2xi64>} : (tensor<3xi32>) -> tensor<1x2x3xi32> @@ -122,30 +98,6 @@ func @broadcast_in_dim_zero_rank(%arg0: tensor) -> tensor<1x2x3xi32> { // ----- -func @broadcast_in_dim_bad_nonint_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { - // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1.0, 2.0]> : 
tensor<2xf64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> - return %0 : tensor<1x2x3xi32> -} - -// ----- - -func @broadcast_in_dim_bad_splat_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { - // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<2.0> : tensor<2xf64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> - return %0 : tensor<1x2x3xi32> -} - -// ----- - -func @broadcast_in_dim_bad_sparse_dimensions(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { - // expected-error@+1 {{broadcast_sizes must be a DenseIntElementsAttr}} - %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> - return %0 : tensor<1x2x3xi32> -} - -// ----- - func @broadcast_in_dim_bad_dimension_rank(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_dimensions has rank 2 instead of rank 1}} %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[[1,1],[1,1]]> : tensor<2x2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> @@ -428,15 +380,15 @@ func @select_bad_pred_shape(%arg0: tensor<3xi1>, %arg1: tensor<2x3xi32>, %arg2: // CHECK-LABEL: func @slice func @slice(%arg0: tensor<3x4xi32>) -> tensor<1x4xi32> { - %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 0]> : tensor<2xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xi32> + %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 0]> : tensor<2xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>, strides = dense<[1, 2]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xi32> return %0 : tensor<1x4xi32> } // ----- func @slice_indices_mismatch(%arg0: tensor<3x4xi32>) -> tensor<1x4xi32> { - // expected-error@+1 {{failed to verify that all of {start_indices, limit_indices} have same type}} - %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 2, 3]> : tensor<3xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xi32> + // expected-error@+1 {{failed to verify that all of {start_indices, limit_indices, strides} have same type}} + %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 2, 3]> : tensor<3xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>, strides = dense<[1, 2]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xi32> return %0 : tensor<1x4xi32> } @@ -444,7 +396,7 @@ func @slice_indices_mismatch(%arg0: tensor<3x4xi32>) -> tensor<1x4xi32> { func @slice_operand_result_mismatch(%arg0: tensor<3x4xi32>) -> tensor<1x4xf32> { // expected-error@+1 {{requires the same element type for all operands and results}} - %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 0]> : tensor<2xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xf32> + %0 = "xla_hlo.slice"(%arg0) {start_indices = dense<[1, 0]> : tensor<2xi64>, limit_indices = dense<[2, 4]> : tensor<2xi64>, strides = dense<[1, 2]> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xf32> return %0 : tensor<1x4xf32> } @@ -458,30 +410,6 @@ func @transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { // ----- -func @transpose_bad_permutations_float(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { - // expected-error@+1 {{permutation must be a DenseIntElementsAttr}} - %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1.0, 0.0, 3.0, 2.0]> : tensor<4xf64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> - return %0: tensor<2x1x4x3xi32> -} - -// ----- - -func 
@transpose_bad_permutations_splat(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { - // expected-error@+1 {{permutation must be a DenseIntElementsAttr}} - %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<2.0> : tensor<2xf64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> - return %0: tensor<2x1x4x3xi32> -} - -// ----- - -func @transpose_bad_permutations_sparse(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { - // expected-error@+1 {{permutation must be a DenseIntElementsAttr}} - %0 = "xla_hlo.transpose"(%arg0) {permutation = sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> - return %0: tensor<2x1x4x3xi32> -} - -// ----- - func @transpose_bad_permutations_rank(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { // expected-error@+1 {{permutation has rank 2 instead of rank 1}} %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[[1]]> : tensor<1x1xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> @@ -519,3 +447,55 @@ func @tuple(%arg0: tensor<1xi32>, %arg1: tensor<1x2xf32>) -> tuple %0 = "xla_hlo.tuple"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> return %0: tuple, tensor<1x2xf32>> } + +// ----- + +func @tuple_arg_size_mismatch(%arg0: tensor, %arg1: tensor) -> tuple, tensor, tensor> { + // expected-error@+1 {{has return type tuple, tensor, tensor>, but expected tuple, tensor>}} + %0 = "xla_hlo.tuple"(%arg0, %arg1) : (tensor, tensor) -> tuple, tensor, tensor> + return %0 : tuple, tensor, tensor> +} + +// ----- + +func @tuple_type_mismatch(%arg0: tensor, %arg1: tensor) -> tuple, tensor> { + // expected-error@+1 {{has return type tuple, tensor>, but expected tuple, tensor>}} + %0 = "xla_hlo.tuple"(%arg0, %arg1) : (tensor, tensor) -> tuple, tensor> + return %0 : tuple, tensor> +} + +// ----- + +func @get_tuple_element(%arg0: tuple, tensor>) -> tensor { + %0 = "xla_hlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple, tensor>) -> tensor + return %0 : tensor +} + +// ----- + +func @get_tuple_element_bad_type(%arg0: tuple, tensor>) -> tensor { + // expected-error@+1 {{has return type tensor, but expected tensor}} + %0 = "xla_hlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple, tensor>) -> tensor + return %0 : tensor +} + +// ----- + +func @get_tuple_element_index_out_of_bounds(%arg0: tuple, tensor>) -> tensor { + // expected-error@+1 {{index 2 is out of bounds of operand with size 2}} + %0 = "xla_hlo.get_tuple_element"(%arg0) {index = 2 : i32} : (tuple, tensor>) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func @reduce_window +func @reduce_window(%arg0: tensor<4x4xi32>) -> tensor<2x2xi32> { + %cst = constant dense<0> : tensor + %0 = "xla_hlo.reduce_window"(%arg0, %cst) ( { + ^bb0(%arg1: tensor, %arg2: tensor): // no predecessors + %6 = "xla_hlo.max"(%arg1, %arg2) : (tensor, tensor) -> tensor + "xla_hlo.return"(%6) : (tensor) -> () + }) {window_dimensions = dense<[2, 2]> : tensor<2xi64>, window_strides = dense<[2, 2]> : tensor<2xi64>, padding = dense<[2, 2]> : tensor<2xi64>} : (tensor<4x4xi32>, tensor) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/add.mlir b/tensorflow/compiler/mlir/xla/tests/translate/add.mlir index a77b90ca083..a457ba59e22 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/add.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/add.mlir @@ -1,6 +1,12 @@ // RUN: tf-mlir-translate -mlir-hlo-to-hlo-text %s | FileCheck %s +// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text 
-emit-use-tuple-args %s | FileCheck %s --check-prefix=TUPLE-ARG +// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text -emit-always-return-tuple %s | FileCheck %s --check-prefix=TUPLE-RET +// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text -emit-use-tuple-args -emit-always-return-tuple %s | FileCheck %s --check-prefix=TUPLES -// CHECK-LABEL: ENTRY %main.5 (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { +// CHECK-LABEL: ENTRY %main.{{.*}} (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] +// TUPLE-ARG-LABEL: ENTRY %main.{{.*}} (arg_tuple.1: (f32[4], f32[4])) -> f32[4] +// TUPLE-RET-LABEL: ENTRY %main.{{.*}} (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> (f32[4]) +// TUPLES-LABEL: ENTRY %main.{{.*}} (arg_tuple.1: (f32[4], f32[4])) -> (f32[4]) func @main(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: %Arg_0.1 = f32[4] parameter(0) // CHECK-NEXT: %Arg_1.2 = f32[4] parameter(1) diff --git a/tensorflow/compiler/mlir/xla/tests/translate/const.mlir b/tensorflow/compiler/mlir/xla/tests/translate/const.mlir new file mode 100644 index 00000000000..42d9c5dc963 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/const.mlir @@ -0,0 +1,30 @@ +// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text %s | FileCheck %s --dump-input-on-failure + +// CHECK-LABEL: ENTRY %main +func @main() -> tensor<2x2x1x1xf32> { + // CHECK: constant.{{.*}} = s64[] constant(1) + %cst = constant dense<1> : tensor + // CHECK: constant.{{.*}} = f32[2,2,1,1] + // CHECK-SAME: { { /*i0=0*/ { /*i1=0*/ {1} }, { /*i1=1*/ {2} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {4} } } } + %cst_0 = constant dense< + [[[[1.000000e+00]], [[2.000000e+00]]], [[[3.000000e+00]], [[4.000000e+00]]]] + > : tensor<2x2x1x1xf32> + + // CHECK: s32[1] constant({1}) + %cst_1 = constant dense<1> : tensor<1xi32> + + // CHECK: %[[C:.*]] = s32[] constant(1) + // CHECK: s32[10] broadcast(s32[] %[[C]]) + %cst_2 = constant dense<1> : tensor<10xi32> + + // CHECK: s32[4] constant({1, 2, 3, 4}) + %cst_3 = constant dense<[1, 2, 3, 4]> : tensor<4xi32> + + // CHECK: s32[2,2] constant({ { 1, 2 }, { 3, 4 } }) + %cst_4 = constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> + + // CHECK: s32[2,2] constant({ { 3, 2 }, { 1, 4 } }) + %cst_5 = constant dense<[[3, 2], [1, 4]]> : tensor<2x2xi32> + + return %cst_0 : tensor<2x2x1x1xf32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/dot_general.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/dot_general.hlotxt new file mode 100644 index 00000000000..25efcfd3e73 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/dot_general.hlotxt @@ -0,0 +1,25 @@ +// RUN: tf-mlir-translate -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule main + +// CHECK-LABEL: @main +// CHECK-SAME: [[ARG0:%[a-zA-Z0-9]+]] +// CHECK-SAME: [[ARG1:%[a-zA-Z0-9]+]] +ENTRY %main (Arg_0.1: f32[4, 1], Arg_1.2: f32[1, 4]) -> f32[] { + %Arg_0.1 = f32[4, 1] parameter(0) + %Arg_1.2 = f32[1, 4] parameter(1) + + // CHECK-NEXT: [[R0:%.+]] = "xla_hlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[]> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<[]> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, name = "dot.3", precision_config = ["HIGH", "HIGHEST"]} + dot.3 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={1}, operand_precision={high,highest} + + // CHECK-NEXT: [[R1:%.+]] = "xla_hlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[]> 
: tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<[]> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, name = "dot.4", precision_config = ["HIGHEST", "DEFAULT"]} + dot.4 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={1}, operand_precision={highest,default} + + // CHECK-NEXT: [[R2:%.+]] = "xla_hlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[]> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<[]> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, name = "dot.5", precision_config = ["DEFAULT", "DEFAULT"]} + %dot.5 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={1}, operand_precision={default,default} + + // TODO(b/129709049) consider making this default precision config inferred. + // CHECK-NEXT: [[R3:%.+]] = "xla_hlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<[]> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<[]> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, name = "dot.6", precision_config = ["DEFAULT", "DEFAULT"]} + // CHECK-NEXT: return [[R3]] + ROOT %dot.6 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={1} +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/translate/multiple_return_tuple.mlir b/tensorflow/compiler/mlir/xla/tests/translate/multiple_return_tuple.mlir new file mode 100644 index 00000000000..87817519870 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/multiple_return_tuple.mlir @@ -0,0 +1,14 @@ +// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text %s | FileCheck %s +// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text -emit-use-tuple-args -emit-always-return-tuple %s | FileCheck %s --check-prefix=TUPLE + +// Test to verify that multiple result function with always emit return tuple +// does not result in nested tuples. + +// CHECK-LABEL: ENTRY %main.{{.*}} (Arg_0.1: s32[4]) -> (s32[4], s32[1,2,3,4]) +// TUPLE-LABEL: ENTRY %main.{{.*}} (arg_tuple.1: (s32[4])) -> (s32[4], s32[1,2,3,4]) +func @main(%arg0: tensor<4xi32>) -> (tensor<4xi32>, tensor<1x2x3x4xi32>) { + // CHECK-NEXT: %Arg_0.1 = s32[4] parameter(0) + // CHECK-NEXT: %broadcast.2 = s32[1,2,3,4] broadcast(s32[4] %Arg_0.1), dimensions={3} + %0 = "xla_hlo.broadcast"(%arg0) {broadcast_sizes = dense<[1,2,3]> : tensor<3xi64>} : (tensor<4xi32>) -> tensor<1x2x3x4xi32> + return %arg0, %0 : tensor<4xi32>, tensor<1x2x3x4xi32> +} diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 830bcde5f04..fabdde69cf6 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -15,13 +15,19 @@ limitations under the License. // This file implements logic for lowering TensorFlow dialect to XLA dialect. 
+#include #include #include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" @@ -47,6 +53,53 @@ static size_t getFeatureDimension(StringAttr format, return isDefaultDataFormat(format.getValue()) ? inputType.getRank() - 1 : 1; } +static IntegerAttr GetHLOAxisFromTFAxis(ElementsAttr attr, int64_t rank, + Builder *b) { + SmallVector index(attr.getType().getRank(), 0); + int64_t axis = attr.getValue(index).getInt(); + if (axis < 0) { + axis += rank; + } + return b->getI64IntegerAttr(axis); +} + +// Returns minimum value for the given int or float element type. +static ConstantOp GetMinValueForType(Type ty, Location loc, + PatternRewriter *rewriter) { + RankedTensorType scalar_ty = rewriter->getTensorType({}, ty); + + DenseElementsAttr attr; + if (auto float_ty = ty.dyn_cast_or_null()) { + APFloat neg_inf = + APFloat::getInf(float_ty.getFloatSemantics(), /*negative=*/true); + attr = DenseElementsAttr::get(scalar_ty, neg_inf); + } else { + auto int_ty = ty.cast(); + APInt min_val = APInt::getSignedMinValue(int_ty.getWidth()); + attr = DenseElementsAttr::get(scalar_ty, min_val); + } + return rewriter->create(loc, attr); +} + +// Builds body for reduce op by using the using the template binary op as the +// reducer op. +template +static void BuildReduceBody(Type element_type, Region *body, + OpBuilder *builder) { + OpBuilder::InsertionGuard guard(*builder); + Block *block = builder->createBlock(body); + + // Block arguments are scalars of the given element type. + Type type = builder->getTensorType(/*shape=*/{}, element_type); + block->addArguments({type, type}); + + Location loc = body->getLoc(); + auto reducer = builder->create(loc, type, block->getArgument(0), + block->getArgument(1), + /*broadcast_dimensions=*/nullptr); + builder->create(loc, reducer.getResult()); +} + //===----------------------------------------------------------------------===// // BatchNorm op utilities. //===----------------------------------------------------------------------===// @@ -72,12 +125,15 @@ static bool hasValidBiasFeatureDimension(StringAttr format, Value *input, return biasType.getDimSize(0) == inputType.getDimSize(featureDim); } -/// Return a 1D ElementsAttr for the feature dimension of a BiasAdd. -static ElementsAttr getBiasFeatureDimension(Builder &b, StringAttr format, - Value *input) { - return b.getDenseIntElementsAttr( - b.getTensorType(1, b.getIntegerType(64)), - getFeatureDimension(format, input->getType().cast())); +/// Return a 1D DenseIntElementsAttr for the feature dimension of a BiasAdd. 
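A quick worked example of the GetHLOAxisFromTFAxis helper defined above, which normalizes TensorFlow's negative-axis convention before it reaches HLO (this is what the concat_v2_neg_axis test earlier in this patch exercises; the values below are only illustrative):

    int64_t axis = -2;           // value of the tf.Const axis operand
    int64_t rank = 2;            // rank of the first (ranked) input
    if (axis < 0) axis += rank;  // -2 + 2 == 0  ->  {dimension = 0 : i64}
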
+static DenseIntElementsAttr getBiasFeatureDimension(Builder &b, + StringAttr format, + Value *input) { + auto inputType = input->getType().cast(); + size_t featureDim = getFeatureDimension(format, inputType); + RankedTensorType type = b.getTensorType(1, b.getIntegerType(64)); + return DenseIntElementsAttr::get(type, featureDim) + .cast(); } //===----------------------------------------------------------------------===// @@ -101,7 +157,8 @@ static ElementsAttr getSplat(Builder &b, Value *val, T constant) { return DenseElementsAttr::get(valType, elementAttr); } -static ElementsAttr getBroadcastDimensionsAttr(Builder &b, Value *x, Value *y) { +static DenseIntElementsAttr getBroadcastDimensionsAttr(Builder &b, Value *x, + Value *y) { TensorType xType = x->getType().dyn_cast(); TensorType yType = y->getType().dyn_cast(); if (xType == yType || !xType || !yType) return {}; @@ -126,23 +183,208 @@ static ElementsAttr getBroadcastDimensionsAttr(Builder &b, Value *x, Value *y) { std::iota(broadcastDimensions.begin(), broadcastDimensions.end(), maxRank - minRank); - return b.getDenseIntElementsAttr( - b.getTensorType({minRank}, b.getIntegerType(64)), broadcastDimensions); + RankedTensorType type = b.getTensorType({minRank}, b.getIntegerType(64)); + return DenseIntElementsAttr::get(type, broadcastDimensions) + .cast(); } +//===----------------------------------------------------------------------===// +// Softmax op utilities. +//===----------------------------------------------------------------------===// + +// Returns a 1-d i64 elements attribute populated with numbers from start to +// end, excluding. +static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, + Builder *builder) { + int size = end - start; + + SmallVector vals; + vals.resize(size); + std::iota(vals.begin(), vals.end(), start); + + TensorType ty = builder->getTensorType({size}, builder->getIntegerType(64)); + return DenseIntElementsAttr::get(ty, vals) + .cast(); +} + +// Returns the type to use for accumulating the given type. +static Type GetAccumulationType(Type ty) { + // Upcast 16 bit sum reductions to 32 bit to reduce the precision loss from + // repeated floating point additions. + return (ty.isF16() || ty.isBF16()) ? FloatType::getF32(ty.getContext()) : ty; +} + +//===----------------------------------------------------------------------===// +// Op converters. +//===----------------------------------------------------------------------===// + namespace mlir { namespace xla { namespace { + +// Converts MaxPool op to HLO ReduceWindow op by setting appropriate window +// dimensions with max as the reduction function. +// +// Sample result for VALID padding mode: +// +// %init = constant dense<...> : tensor +// %max_pool = "xla_hlo.reduce"(%inp, %init) ["xla_hlo.max"] +// {window_dimensions = ..., window_strides = ... } +// +class ConvertMaxPoolOp : public OpRewritePattern { + public: + explicit ConvertMaxPoolOp(MLIRContext *context) + : OpRewritePattern(context, 1) {} + + PatternMatchResult matchAndRewrite(TF::MaxPoolOp op, + PatternRewriter &rewriter) const override { + // TODO(hinsu): Support 'SAME' padding mode. 
+ if (op.padding() != "VALID") return matchFailure(); + + Type element_type = + op.input()->getType().cast().getElementType(); + if (!element_type.isIntOrFloat()) return matchFailure(); + Location loc = op.getLoc(); + ConstantOp init = GetMinValueForType(element_type, loc, &rewriter); + + auto get_elements_attr = [&](ArrayAttr attr) { + RankedTensorType ty = rewriter.getTensorType( + static_cast(attr.size()), rewriter.getIntegerType(64)); + return DenseElementsAttr::get(ty, attr.getValue()) + .cast(); + }; + + auto reduce = rewriter.create( + loc, op.getType(), op.input(), init.getResult(), + get_elements_attr(op.ksize()), get_elements_attr(op.strides()), + /*base_dilations=*/DenseIntElementsAttr(), + /*window_dilations=*/DenseIntElementsAttr(), + /*paddings=*/DenseIntElementsAttr()); + BuildReduceBody(element_type, &reduce.body(), &rewriter); + + rewriter.replaceOp(op, reduce.getResult(0)); + return matchSuccess(); + } +}; + +// Converts Softmax op to HLO ops computing softmax with the following formula: +// +// softmax = div(exp(logits), sum(exp(logits))) +// +// Sample result with 2-d f16 inputs with B batches of with N elements each. +// +// // Subtract each element by their batches' max to improve numerical +// // stability. +// %neg_infinity = constant dense<0xFF800000> : tensor +// %max = "xla_hlo.reduce"(%input, %neg_infinity) ["xla_hlo.max"] +// {dimensions = 1} +// : (tensor, tensor<1xf16>) -> tensor +// %sub = "xla_hlo.sub"(%inp, %max) {broadcast_dimensions = 0} +// : (tensor, tensor) -> tensor +// +// %exp = "xla_hlo.exp"(%sub) : (tensor) -> tensor +// +// // Cast to f32 to avoid precision loss in summation. +// %exp_f32 = "xla_hlo.convert"(%exp) : (tensor) -> tensor +// %zero = constant dense<0.000000e+00> : tensor +// %sum = "xla_hlo.reduce"(%exp, %zero) ["xla_hlo.add"] {dimensions = 1} +// : (tensor, tensor<1xf32>) -> tensor +// +// %sum_f16 = "xla_hlo.convert"(%sum) : (tensor) -> tensor +// %softmax = "xla_hlo.div"(%exp, %sum_f16) {broadcast_dimensions = 0} +// : (tensor, tensor) -> tensor +// +class ConvertSoftmaxOp : public OpRewritePattern { + public: + explicit ConvertSoftmaxOp(MLIRContext *context) + : OpRewritePattern(context, 1) {} + + PatternMatchResult matchAndRewrite(TF::SoftmaxOp op, + PatternRewriter &rewriter) const override { + Value *logits = op.logits(); + + // Softmax converter requires ranked type because the XLA reduce ops used + // while lowering requires dimensions attribute to reduce along. + RankedTensorType type = logits->getType().dyn_cast(); + if (!type) return matchFailure(); + int rank = type.getRank(); + + // Note that the TensorFlow Softmax op verifies that the input rank is + // greater than or equal to one so both of the following sequences are + // valid. + auto batch_dims = GetI64ElementsAttrForSeq(0, rank - 1, &rewriter); + auto reduce_dim = GetI64ElementsAttrForSeq(rank - 1, rank, &rewriter); + Location loc = op.getLoc(); + + // Exponential of input values and then their sum can be very large here. + // Division with large denominator is numerically unstable. To improve + // numerical stability, subtract each batch with their max element so that + // the maximum input value is zero. It can be shown that softmax computed + // after adding or subtracting all inputs in a batch using a common value + // gives mathematically equivalent result. 
+ Type element_type = type.getElementType(); + ArrayRef reduce_shape = type.getShape().drop_back(); + RankedTensorType reduce_out_type = + rewriter.getTensorType(reduce_shape, element_type); + auto init = GetMinValueForType(element_type, loc, &rewriter); + auto max_logits = rewriter.create( + loc, reduce_out_type, logits, init.getResult(), reduce_dim); + BuildReduceBody(element_type, &max_logits.body(), + &rewriter); + auto shifted_logits = rewriter.create( + loc, type, logits, max_logits.getResult(0), batch_dims); + + // Exponentiate the inputs. + Value *exp = rewriter.create(loc, type, shifted_logits); + + // Cast the exponentials to the appropriate accumulation type to avoid + // precision loss during summation. + Type sum_element_type = GetAccumulationType(element_type); + Type sum_type = rewriter.getTensorType(type.getShape(), sum_element_type); + auto casted_exp = rewriter.create(loc, sum_type, exp); + + // Compute summation of the exponentials. + init = rewriter.create( + loc, DenseElementsAttr::get(rewriter.getTensorType({}, element_type), + rewriter.getZeroAttr(element_type))); + Type sum_out_type = rewriter.getTensorType(reduce_shape, sum_element_type); + auto exp_sum = rewriter.create( + loc, sum_out_type, casted_exp.getResult(), init.getResult(), + reduce_dim); + BuildReduceBody(element_type, &exp_sum.body(), &rewriter); + Value *sum = exp_sum.getResult(0); + + // Convert the summation result back to the original element type and divide + // exponentials by the summations. + sum = rewriter.create(loc, reduce_out_type, sum); + rewriter.replaceOpWithNewOp(op, op.getType(), exp, sum, + batch_dims); + return matchSuccess(); + } +}; + #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_tf.inc" } // end anonymous namespace } // end namespace xla } // end namespace mlir void mlir::xla_hlo::legalizeTF(Operation *op) { - // Add the generated patterns to the list. + // Add lowering patterns to the list. OwningRewritePatternList patterns; xla::populateWithGenerated(op->getContext(), &patterns); + // Add patterns that lower some of the high level TensorFlow ops to lower + // level TensorFlow ops. So, we don't have to target all the TensorFlow ops + // here for lowering to HLO. + // + // TODO(b/140964075): Switch to DialectConversion to avoid premature lowering + // to lower level TensorFlow ops if we actually want to target the higher + // level TensorFlow op directly. + mlir::TF::PopulateLoweringTFPatterns(op->getContext(), &patterns); + + patterns.insert(op->getContext()); + patterns.insert(op->getContext()); + // Recursively applies rewrite patterns to nested operations. applyPatternsGreedily(op, patterns); } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index d67f7b0c5fd..fe930e6095d 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -21,7 +21,7 @@ include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" def NullArrayAttr : NativeCodeCall<"ArrayAttr()">; -def NullElementsAttr : NativeCodeCall<"ElementsAttr()">; +def NullDenseIntElementsAttr : NativeCodeCall<"DenseIntElementsAttr()">; //===----------------------------------------------------------------------===// // BatchNorm op patterns. 
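The ConvertSoftmaxOp pattern above implements softmax(x) = exp(x - max(x)) / sum(exp(x - max(x))), with the summation accumulated in f32 for f16/bf16 inputs. A scalar C++ sketch of the same max-shifting trick, purely to illustrate the numerics (not part of the lowering itself):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Numerically stable softmax over a single batch row.
    std::vector<float> StableSoftmax(const std::vector<float>& logits) {
      float max_logit = *std::max_element(logits.begin(), logits.end());
      std::vector<float> result(logits.size());
      float sum = 0.0f;  // the lowering accumulates this in f32 for 16-bit inputs
      for (size_t i = 0; i < logits.size(); ++i) {
        // Shifting by the row max keeps exp() from overflowing.
        result[i] = std::exp(logits[i] - max_logit);
        sum += result[i];
      }
      for (float& value : result) value /= sum;
      return result;
    }
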
@@ -87,6 +87,33 @@ foreach fromToBinPair = [[TF_AddOp, HLO_AddOp], [TF_SubOp, HLO_SubOp]] in def : DirectBinaryPat; +//===----------------------------------------------------------------------===// +// Concat op patterns. +//===----------------------------------------------------------------------===// + +def OneElementAttrPred + : CPred<"$_self.cast().getType().getNumElements() == 1">; + +def OneElementAttr + : ElementsAttrBase, + "Scalar ElementsAttr">; + +def GetHLOAxisFromTFAxis : NativeCodeCall< + "GetHLOAxisFromTFAxis(" + "$0, (*$1.begin())->getType().cast().getRank(), " + "&$_builder)">; + +def HasRankedFirstOperand + : ConstraintgetType().isa()">>; + +// Here, we convert from TensorFlow axis format to HLO axis format which +// doesn't wrap around like TensorFlow and is always positive. For this +// conversion, use the first input to get inputs rank. Other inputs need not be +// ranked. +def : Pat<(TF_ConcatV2Op $inputs, (TF_ConstOp OneElementAttr:$axis), $unused), + (HLO_ConcatenateOp $inputs, (GetHLOAxisFromTFAxis $axis, $inputs)), + [(HasRankedFirstOperand $inputs)]>; + //===----------------------------------------------------------------------===// // Identity op patterns. //===----------------------------------------------------------------------===// @@ -118,7 +145,7 @@ class ConstantSplat : NativeCodeCall< def : Pat<(TF_ReluOp AnyTensor:$input), (HLO_MaxOp (ConstantOp (ConstantSplat<"0"> $input)), $input, - (NullElementsAttr))>; + (NullDenseIntElementsAttr))>; def : Pat<(TF_Relu6Op AnyTensor:$input), (HLO_ClampOp (ConstantOp (ConstantSplat<"0"> $input)), $input, @@ -128,8 +155,7 @@ def : Pat<(TF_Relu6Op AnyTensor:$input), // Unary op patterns. //===----------------------------------------------------------------------===// -def : Pat<(TF_ReshapeOp:$res AnyStaticShapeTensor:$arg, $ignored), - (HLO_ReshapeOp $arg), [(AnyStaticShapeTensor $res)]>; - -def : Pat<(TF_SqueezeOp AnyStaticShapeTensor:$arg, $ignored_dims), - (HLO_ReshapeOp $arg)>; +foreach TfOp = [TF_ExpandDimsOp, TF_ReshapeOp, TF_SqueezeOp] in { + def : Pat<(TfOp:$res AnyStaticShapeTensor:$arg, $ignored), + (HLO_ReshapeOp $arg), [(AnyStaticShapeTensor $res)]>; +} diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc index 780e6380398..03f55f1a1cf 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc @@ -111,26 +111,23 @@ Value* GetBinaryOp(Type element_type, Location loc, Value* lhs, } template -struct BinaryOpConverter : public RewritePattern { - explicit BinaryOpConverter(MLIRContext* context) - : RewritePattern(LhloOp::getOperationName(), {}, 1, context) {} +struct BinaryOpConverter : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - PatternMatchResult matchAndRewrite(Operation* op, + PatternMatchResult matchAndRewrite(LhloOp op, PatternRewriter& rewriter) const override { - auto binary_op = cast(op); - - const auto& lhs = binary_op.lhs(); - const auto& rhs = binary_op.rhs(); + const auto& lhs = op.lhs(); + const auto& rhs = op.rhs(); const auto& lhs_type = lhs->getType().template cast(); const auto& rhs_type = rhs->getType().template cast(); const auto& element_type = lhs_type.getElementType(); if (lhs_type.getShape() != rhs_type.getShape()) { - return matchFailure(); + return this->matchFailure(); } const auto& shape = lhs_type.getShape(); SmallVector induction_vars; - const auto loc = op->getLoc(); + 
const auto loc = op.getLoc(); for (int i = 0; i < shape.size(); ++i) { auto forOp = rewriter.create(loc, 0, shape[i]); induction_vars.push_back(forOp.getInductionVar()); @@ -140,23 +137,26 @@ struct BinaryOpConverter : public RewritePattern { auto r = rewriter.create(loc, rhs, induction_vars); auto result = GetBinaryOp(element_type, loc, l, r, rewriter); if (result == nullptr) { - return matchFailure(); + return this->matchFailure(); } - rewriter.create(loc, result, binary_op.out(), induction_vars); + rewriter.create(loc, result, op.out(), induction_vars); rewriter.replaceOp(op, {}); - return matchSuccess(); + return this->matchSuccess(); } }; void populateLHLOToAffineConversionPattern(MLIRContext* context, OwningRewritePatternList* patterns) { - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); - patterns->insert>(context); + // clang-format off + patterns->insert< + BinaryOpConverter, + BinaryOpConverter, + BinaryOpConverter, + BinaryOpConverter, + BinaryOpConverter, + BinaryOpConverter, + BinaryOpConverter>(context); + // clang-format on } struct LhloLegalizeToAffine : public FunctionPass { diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.cc b/tensorflow/compiler/mlir/xla/type_to_shape.cc index e64182889cb..06d945946f6 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape.cc @@ -22,7 +22,11 @@ limitations under the License. #include "mlir/IR/Location.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/Support/DebugStringHelper.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -68,6 +72,23 @@ PrimitiveType TypeToPrimitiveType(mlir::Type type) { } } +StatusOr TypeToShape( + mlir::Type type, CustomShapeRepresentationFn shape_representation_fn) { + tensorflow::PartialTensorShape partial_tensor_shape = + tensorflow::ConvertTypeToTensorShape(type); + + tensorflow::TensorShape fully_defined_tensor_shape; + if (!partial_tensor_shape.AsTensorShape(&fully_defined_tensor_shape)) { + return tensorflow::errors::InvalidArgument( + "XLA HLO only allows fully-defined shape"); + } + + tensorflow::DataType dtype; + TF_RETURN_IF_ERROR(tensorflow::ConvertToDataType(type, &dtype)); + + return shape_representation_fn(fully_defined_tensor_shape, dtype); +} + Shape TypeToShape(mlir::Type type) { PrimitiveType ptype = TypeToPrimitiveType(type); if (ptype != PrimitiveType::PRIMITIVE_TYPE_INVALID) diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.h b/tensorflow/compiler/mlir/xla/type_to_shape.h index 6bd5384f857..4bc3fac9b1c 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.h +++ b/tensorflow/compiler/mlir/xla/type_to_shape.h @@ -16,15 +16,29 @@ limitations under the License. 
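Relating back to the BinaryOpConverter refactor earlier in this patch: the pattern still emits one affine loop per memref dimension with a load-compute-store body, so for a 2-D xla_lhlo.add it behaves like the plain loop nest below (a C++ analogy of the generated IR, not the IR itself; `shape`, `lhs`, `rhs`, and `out` stand in for the memref operands):

    for (int64_t i = 0; i < shape[0]; ++i) {
      for (int64_t j = 0; j < shape[1]; ++j) {
        // load lhs/rhs, apply the binary op, store to out
        out[i][j] = lhs[i][j] + rhs[i][j];
      }
    }
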
#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TYPE_TO_SHAPE_H_ #define TENSORFLOW_COMPILER_MLIR_XLA_TYPE_TO_SHAPE_H_ +#include "llvm/ADT/STLExtras.h" #include "mlir/IR/Types.h" // TF:local_config_mlir #include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" namespace xla { // Returns a XLA Shape equivalent of a MLIR Type, else returns empty shape. Shape TypeToShape(mlir::Type type); +// Type of a custom function that converts a TensorFlow type and shape into an +// XLA shape with optional layout info. +typedef llvm::function_ref( + const tensorflow::TensorShape&, tensorflow::DataType)> + CustomShapeRepresentationFn; + +// Compute an XLA shape based in given MLIR type and an +// CustomShapeRepresentationFn, which allows setting custom layout in returned +// XLA shape. +StatusOr TypeToShape( + mlir::Type type, CustomShapeRepresentationFn shape_representation_fn); + // Returns a XLA PrimitiveType equivalent of a MLIR Type that represents a // primitive type (e.g., i8, f32), else returns PRIMITIVE_TYPE_INVALID. PrimitiveType TypeToPrimitiveType(mlir::Type type); diff --git a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc index 57922fe1532..49a4a838e30 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc @@ -134,5 +134,43 @@ TEST(TypeToShapeTest, ConvertTensorTypeToTypes) { EqualsProto(Shape().ToProto())); } +TEST(TypeToShapeTest, ConvertWithShapeRepresentationFn) { + tensorflow::DataType captured_dtype; + tensorflow::TensorShape captured_tensor_shape; + + // A dummy shape representation function that does nothing other than + // capturing arguments passed to it. + auto test_shape_representation_fn = [&](const tensorflow::TensorShape& shape, + tensorflow::DataType dtype) { + captured_tensor_shape = shape; + captured_dtype = dtype; + return xla::Shape(); + }; + + MLIRContext context; + Builder b(&context); + StatusOr status_or_shape; + + // Non-fully-defined shape. + status_or_shape = TypeToShape(b.getTensorType({-1, 2, 3}, b.getF32Type()), + test_shape_representation_fn); + EXPECT_EQ(status_or_shape.status().code(), + tensorflow::errors::Code::INVALID_ARGUMENT); + + // Scalar Int32 Tensor, using fast memory. + status_or_shape = + TypeToShape(b.getIntegerType(32), test_shape_representation_fn); + EXPECT_TRUE(status_or_shape.ok()); + EXPECT_EQ(captured_dtype, tensorflow::DataType::DT_INT32); + EXPECT_EQ(captured_tensor_shape, tensorflow::TensorShape()); + + // Ranked Float32 Tensor, not using fast memory. + status_or_shape = TypeToShape(b.getTensorType({1, 2, 3}, b.getF32Type()), + test_shape_representation_fn); + EXPECT_TRUE(status_or_shape.ok()); + EXPECT_EQ(captured_dtype, tensorflow::DataType::DT_FLOAT); + EXPECT_EQ(captured_tensor_shape, tensorflow::TensorShape({1, 2, 3})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc index ad7e4724d90..7fbc5e4e2bc 100644 --- a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/xla/xla_mlir_translate.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/IR/Module.h" // TF:local_config_mlir @@ -30,6 +31,18 @@ limitations under the License. using stream_executor::port::Status; using stream_executor::port::StatusOr; // NOLINT TODO(b/130822468) fix this +// NOLINTNEXTLINE +static llvm::cl::opt emit_use_tuple_arg( + "emit-use-tuple-args", + llvm::cl::desc("Emit HLO modules using tuples as args"), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +static llvm::cl::opt emit_always_return_tuple( + "emit-always-return-tuple", + llvm::cl::desc("Emit HLO modules always return tuple"), + llvm::cl::init(false)); + namespace xla { namespace { @@ -122,7 +135,8 @@ static mlir::LogicalResult MlirHloToHloTranslateFunction( } HloProto hloProto; - Status status = mlir::ConvertMlirHloToHlo(module, &hloProto); + Status status = mlir::ConvertMlirHloToHlo( + module, &hloProto, emit_use_tuple_arg, emit_always_return_tuple); if (!status.ok()) { LOG(ERROR) << "Module conversion failed: " << status; return mlir::failure(); @@ -155,7 +169,8 @@ static mlir::LogicalResult MlirHloToHloTextTranslateFunction( } HloProto hloProto; - Status status = mlir::ConvertMlirHloToHlo(module, &hloProto); + Status status = mlir::ConvertMlirHloToHlo( + module, &hloProto, emit_use_tuple_arg, emit_always_return_tuple); if (!status.ok()) { LOG(ERROR) << "Module conversion failed: " << status; return mlir::failure(); diff --git a/tensorflow/compiler/tests/slice_ops_test.py b/tensorflow/compiler/tests/slice_ops_test.py index b7784062e82..1d511ede1c2 100644 --- a/tensorflow/compiler/tests/slice_ops_test.py +++ b/tensorflow/compiler/tests/slice_ops_test.py @@ -22,6 +22,7 @@ from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest @@ -138,6 +139,22 @@ class StridedSliceTest(xla_test.XLATestCase): self.assertAllEqual([2, 4], result) + def test1DDynamic(self): + for dtype in self.numeric_types: + with self.session(): + i = array_ops.placeholder(dtype, shape=[10]) + begin = array_ops.placeholder(dtypes.int32, shape=[1]) + with self.test_scope(): + end = math_ops.add(begin, [1]) + o = array_ops.strided_slice(i, begin, end, [1]) + params = { + i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + begin: [0] + } + result = o.eval(feed_dict=params) + + self.assertAllEqual([0], result) + def test1DNegativeStride(self): for dtype in self.numeric_types: with self.session(): @@ -179,6 +196,22 @@ class StridedSliceTest(xla_test.XLATestCase): self.assertEqual(tensor_shape.TensorShape((0, 3)), result.shape) + def test2DFullSlice(self): + for dtype in self.numeric_types: + with self.session(): + with self.test_scope(): + i = array_ops.placeholder(dtype, shape=[2, 4]) + begin = array_ops.placeholder(dtypes.int32, shape=[2]) + end = math_ops.add(begin, [1, 1]) + o = array_ops.strided_slice(i, begin, end, [1, 1]) + params = { + i: [[0, 1, 2, 3], [4, 5, 6, 7]], + begin: [1, 1] + } + result = o.eval(feed_dict=params) + + self.assertAllEqual([[5]], result) + def test3D(self): for dtype in self.numeric_types: with self.session(): diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 43f920b9ccc..65d5f9a2ecd 100644 --- 
a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -1566,8 +1566,11 @@ void Converter::MaybeApplyQuantizationRanges() { #endif if (use_calibration()) return; +#if !IS_TRT_VERSION_GE(6, 0, 0, 0) // Attempt to find tensors that are missing ranges, and set the corresponding // layer's precision to FP16 to avoid Builder::buildCudaEngine() failing. + // This is only needed for TensorRT 5 and before because + // TensorRT6 falls to FP16 internally. // TensorRT doesn't need ranges for intermediate tensors when layers are fused // so find fused layers first. // Get all tensors from network and deduce fused ops. @@ -1696,6 +1699,7 @@ void Converter::MaybeApplyQuantizationRanges() { } } } +#endif } void Converter::PropagateQuantizationRanges() { @@ -5211,6 +5215,18 @@ Status ConvertCombinedNMS(OpConverterParams* params) { &plugin_inputs[0], static_cast(plugin_inputs.size()), *plugin); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + // Set plugin outputs + nvinfer1::ITensor* output_nmsed_boxes = layer->getOutput(1); +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + // TRT6 fixes (removes) the extra last dimension in CombinedNMS outputs + nvinfer1::ITensor* output_num_detections = layer->getOutput(0); + nvinfer1::ITensor* output_nmsed_scores = layer->getOutput(2); + nvinfer1::ITensor* output_nmsed_classes = layer->getOutput(3); +#else + nvinfer1::ITensor* output_num_detections = nullptr; + nvinfer1::ITensor* output_nmsed_scores = nullptr; + nvinfer1::ITensor* output_nmsed_classes = nullptr; + auto shrink_last_dim = [params](nvinfer1::ITensor* in_tensor, nvinfer1::ITensor** out_tensor) { nvinfer1::Dims dims = in_tensor->getDimensions(); @@ -5224,18 +5240,13 @@ Status ConvertCombinedNMS(OpConverterParams* params) { /*validation_only=*/false, out_tensor)); return Status::OK(); }; - - // Set plugin outputs - nvinfer1::ITensor* output_nmsed_boxes = layer->getOutput(1); - nvinfer1::ITensor* output_nmsed_scores = nullptr; - nvinfer1::ITensor* output_nmsed_classes = nullptr; - nvinfer1::ITensor* output_num_detections = nullptr; TF_RETURN_IF_ERROR( shrink_last_dim(layer->getOutput(2), &output_nmsed_scores)); TF_RETURN_IF_ERROR( shrink_last_dim(layer->getOutput(3), &output_nmsed_classes)); TF_RETURN_IF_ERROR( shrink_last_dim(layer->getOutput(0), &output_num_detections)); +#endif // IS_TRT_VERSION_GE(6, 0, 0, 0) params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_boxes)); params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_scores)); diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index dedeb647023..88533727c27 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -459,6 +459,7 @@ tf_cc_test( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index a431abd26e0..34888fc0e2f 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -90,14 +90,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, break; case XlaExpression::Kind::kResource: { XlaResource* resource = expressions[i]->resource(); - - arg.initialized = 
resource->initialized(); - arg.kind = XlaCompiler::Argument::kResource; - arg.resource_kind = resource->kind(); - arg.type = resource->type(); - arg.shape = resource->shape(); - arg.max_array_size = resource->max_array_size(); - arg.name = resource->name(); + XlaCompiler::PopulateArgumentFromResource(*resource, &arg); break; } case XlaExpression::Kind::kTensorList: { diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index 47574a8c202..19c09b07959 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -220,9 +220,7 @@ XLA_MAKE_BINARY(SigmoidGrad, xla::Mul(xla::Mul(rhs, lhs), xla::Sub(XlaHelpers::One(b, input_type(0)), lhs))); -XLA_MAKE_BINARY(SoftplusGrad, - xla::Div(lhs, xla::Add(xla::Exp(xla::Neg(rhs)), - XlaHelpers::One(b, input_type(1))))); +XLA_MAKE_BINARY(SoftplusGrad, xla::Mul(lhs, xla::Logistic(rhs))); // softsigngrad(gradients, features) = gradients / (1 + abs(features)) ** 2 XLA_MAKE_BINARY(SoftsignGrad, diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.cc b/tensorflow/compiler/tf2xla/kernels/case_op.cc index 5ba844e10bd..9b3770cf55e 100644 --- a/tensorflow/compiler/tf2xla/kernels/case_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/case_op.cc @@ -93,20 +93,9 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { if (type == DT_RESOURCE) { XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(i + 1, &resource)); - - arg.initialized = resource->initialized(); - arg.kind = XlaCompiler::Argument::kResource; - arg.resource_kind = resource->kind(); - - arg.type = resource->type(); - arg.shape = resource->shape(); + XlaCompiler::PopulateArgumentFromResource(*resource, &arg); OP_REQUIRES(ctx, arg.initialized, errors::Unimplemented("Uninitialized arguments: ", arg.name)); - arg.max_array_size = resource->max_array_size(); - for (const auto& gradient : resource->tensor_array_gradients()) { - arg.tensor_array_gradients.insert(gradient.first); - } - arg.name = resource->name(); VLOG(2) << "Resource " << resource->name() << " type: " << DataTypeString(arg.type) << " shape: " << arg.HumanString() @@ -235,6 +224,22 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { branch_results[0].xla_output_shape); } + // Check that all branches have same TensorList output indices. + for (int output_index = 0; output_index < branch_results[0].outputs.size(); + output_index++) { + bool is_tensor_list_in_branch_0 = + branch_results[0].outputs[output_index].is_tensor_list; + bool is_tensor_list_in_branch_j = + branch_results[j].outputs[output_index].is_tensor_list; + OP_REQUIRES( + ctx, is_tensor_list_in_branch_0 == is_tensor_list_in_branch_j, + errors::FailedPrecondition("Output #", output_index, " is ", + (is_tensor_list_in_branch_0 ? "" : "not"), + " a TensorList in branch 0, but is ", + (is_tensor_list_in_branch_j ? "" : "not"), + " a TensorList in branch ", j)); + } + // We set return_updated_values_for_all_resources=true and we pass the same // arguments to both computations, so the resource update count must match. OP_REQUIRES(ctx, @@ -296,7 +301,12 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { LOG(INFO) << "Shape unknown for output " << i; } } - ctx->SetOutput(i, output_handle); + // We have checked that all branches have same TensorList output indices. 
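// For example (illustrative only): if branch 0 of a Case returns
// {float32 Tensor, TensorList} while branch 1 returns
// {float32 Tensor, float32 Tensor}, output #1 has no single consistent
// representation, so the FailedPrecondition check above rejects the op;
// once the check passes, each output is forwarded with the matching
// setter below.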
+ if (branch_results[0].outputs[i].is_tensor_list) { + ctx->SetTensorListOutput(i, output_handle); + } else { + ctx->SetOutput(i, output_handle); + } } if (has_token_input_output_) { // Set token output for this "Case" op. Token output is the last output of diff --git a/tensorflow/compiler/tf2xla/kernels/einsum_op.cc b/tensorflow/compiler/tf2xla/kernels/einsum_op.cc index 6b3334dc1de..bf9313389dd 100644 --- a/tensorflow/compiler/tf2xla/kernels/einsum_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/einsum_op.cc @@ -25,7 +25,8 @@ limitations under the License. namespace tensorflow { namespace { -constexpr std::array kEinsumTypes = {{DT_BFLOAT16, DT_FLOAT}}; +constexpr std::array kEinsumTypes = { + {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}}; class EinsumOp : public XlaOpKernel { public: @@ -38,8 +39,6 @@ class EinsumOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { xla::XlaOp lhs = ctx->Input(0); xla::XlaOp rhs = ctx->Input(1); - const TensorShape a_shape = ctx->InputShape(0); - const TensorShape b_shape = ctx->InputShape(1); ctx->SetOutput(0, xla::Einsum(lhs, rhs, equation_)); } @@ -49,6 +48,7 @@ class EinsumOp : public XlaOpKernel { }; REGISTER_XLA_OP(Name("XlaEinsum").TypeConstraint("T", kEinsumTypes), EinsumOp); +REGISTER_XLA_OP(Name("Einsum").TypeConstraint("T", kEinsumTypes), EinsumOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc index 5ac288d8a34..e5e4e797cc5 100644 --- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc @@ -138,10 +138,20 @@ class RFFTOp : public GenericFftOp { explicit RFFTOp(OpKernelConstruction* ctx) : GenericFftOp(ctx, /*fft_type=*/FftType::RFFT, /*fft_rank=*/FFTRank) {} }; -REGISTER_XLA_OP(Name("RFFT").CompileTimeConstantInput("fft_length"), RFFTOp<1>); -REGISTER_XLA_OP(Name("RFFT2D").CompileTimeConstantInput("fft_length"), +REGISTER_XLA_OP(Name("RFFT") + .TypeConstraint("Treal", DT_FLOAT) + .TypeConstraint("Tcomplex", DT_COMPLEX64) + .CompileTimeConstantInput("fft_length"), + RFFTOp<1>); +REGISTER_XLA_OP(Name("RFFT2D") + .TypeConstraint("Treal", DT_FLOAT) + .TypeConstraint("Tcomplex", DT_COMPLEX64) + .CompileTimeConstantInput("fft_length"), RFFTOp<2>); -REGISTER_XLA_OP(Name("RFFT3D").CompileTimeConstantInput("fft_length"), +REGISTER_XLA_OP(Name("RFFT3D") + .TypeConstraint("Treal", DT_FLOAT) + .TypeConstraint("Tcomplex", DT_COMPLEX64) + .CompileTimeConstantInput("fft_length"), RFFTOp<3>); template @@ -150,11 +160,20 @@ class IRFFTOp : public GenericFftOp { explicit IRFFTOp(OpKernelConstruction* ctx) : GenericFftOp(ctx, /*fft_type=*/FftType::IRFFT, /*fft_rank=*/FFTRank) {} }; -REGISTER_XLA_OP(Name("IRFFT").CompileTimeConstantInput("fft_length"), +REGISTER_XLA_OP(Name("IRFFT") + .TypeConstraint("Treal", DT_FLOAT) + .TypeConstraint("Tcomplex", DT_COMPLEX64) + .CompileTimeConstantInput("fft_length"), IRFFTOp<1>); -REGISTER_XLA_OP(Name("IRFFT2D").CompileTimeConstantInput("fft_length"), +REGISTER_XLA_OP(Name("IRFFT2D") + .TypeConstraint("Treal", DT_FLOAT) + .TypeConstraint("Tcomplex", DT_COMPLEX64) + .CompileTimeConstantInput("fft_length"), IRFFTOp<2>); -REGISTER_XLA_OP(Name("IRFFT3D").CompileTimeConstantInput("fft_length"), +REGISTER_XLA_OP(Name("IRFFT3D") + .TypeConstraint("Treal", DT_FLOAT) + .TypeConstraint("Tcomplex", DT_COMPLEX64) + .CompileTimeConstantInput("fft_length"), IRFFTOp<3>); } // namespace diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc 
b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index 489ffd3fdad..3178c04875a 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -25,8 +25,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" namespace tensorflow { @@ -150,6 +152,85 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, return Status::OK(); } +Status XlaGatherWithBatchDimsOpImpl(XlaOpKernelContext* context, + const xla::XlaOp input, + const TensorShape& input_shape, + int batch_dims, xla::XlaOp* gather_output) { + auto indices = context->Input(1); + auto indices_shape = context->InputShape(1); + + absl::optional axis; + if (context->num_inputs() == 3) { + const TensorShape axis_shape = context->InputShape(2); + if (!TensorShapeUtils::IsScalar(axis_shape)) { + return errors::InvalidArgument("axis must be scalar"); + } + DataType axis_type = context->input_type(2); + if (axis_type != DT_INT32 && axis_type != DT_INT64) { + return errors::InvalidArgument("axis must be int32 or int64"); + } + + int64 axis_input; + TF_RETURN_IF_ERROR(context->ConstantInputAsIntScalar(2, &axis_input)); + + const auto params_dims = input_shape.dims(); + if (-params_dims > axis_input || axis_input >= params_dims) { + return errors::InvalidArgument("Expected axis in the range [", + -params_dims, ", ", params_dims, + "), but got ", axis_input); + } + if (axis_input < 0) { + axis_input += params_dims; + } + axis = axis_input; + } + + if (batch_dims != 0) { + if (batch_dims < 0) { + batch_dims = indices_shape.dims() + batch_dims; + } + + axis = axis.value_or(batch_dims); + + if (batch_dims < -indices_shape.dims() || + batch_dims >= indices_shape.dims()) { + return errors::InvalidArgument( + "Expected batch_dims in the range [", -indices_shape.dims(), ", ", + indices_shape.dims(), "), but got ", batch_dims); + } + + if (batch_dims >= input_shape.dims()) { + return errors::InvalidArgument("batch_dims (", batch_dims, + ") must be less than rank(input) (", + input_shape.dims(), ")."); + } + + if (*axis < batch_dims) { + return errors::InvalidArgument("batch_dims (", batch_dims, + ") must be less than or equal to ", + "axis (", *axis, ")."); + } + } + + axis = axis.value_or(0); + DataType index_type = context->input_type(1); + if (index_type != DT_INT32 && index_type != DT_INT64) { + return errors::InvalidArgument("indices must be int32 or int64"); + } + + xla::XlaOp gather; + if (batch_dims > 0) { + *gather_output = xla::TorchIndexSelect(input, indices, *axis, batch_dims); + } else { + // XlaGather() manages degenerate cases, like empty-indices, which are + // error conditions and caught above if batch_dims is not 0. 
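// Shape sketch (illustrative values only): with params of shape [B, N, D]
// and indices of shape [B, K], batch_dims = 1 takes the TorchIndexSelect
// path above and yields an output of shape [B, K, D]; with batch_dims = 0,
// params of shape [N, D] and indices of shape [K] gathered along axis 0
// fall through to the XlaGather call below and yield [K, D].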
+ TF_RETURN_IF_ERROR( + XlaGather(input, input_shape, indices, indices_shape, *axis, + /*indices_are_nd=*/false, context->expected_output_dtype(0), + index_type, context->builder(), gather_output)); + } + return Status::OK(); +} class GatherOp : public XlaOpKernel { public: explicit GatherOp(OpKernelConstruction* context) : XlaOpKernel(context) { @@ -164,76 +245,11 @@ class GatherOp : public XlaOpKernel { void Compile(XlaOpKernelContext* context) override { auto input = context->Input(0); auto input_shape = context->InputShape(0); - auto indices = context->Input(1); - auto indices_shape = context->InputShape(1); - - absl::optional axis; - if (context->num_inputs() == 3) { - const TensorShape axis_shape = context->InputShape(2); - OP_REQUIRES(context, TensorShapeUtils::IsScalar(axis_shape), - errors::InvalidArgument("axis must be scalar")); - DataType axis_type = input_type(2); - OP_REQUIRES(context, axis_type == DT_INT32 || axis_type == DT_INT64, - errors::InvalidArgument("axis must be int32 or int64")); - - int64 axis_input; - OP_REQUIRES_OK(context, - context->ConstantInputAsIntScalar(2, &axis_input)); - - const auto params_dims = input_shape.dims(); - OP_REQUIRES(context, - -params_dims <= axis_input && axis_input < params_dims, - errors::InvalidArgument("Expected axis in the range [", - -params_dims, ", ", params_dims, - "), but got ", axis_input)); - if (axis_input < 0) { - axis_input += params_dims; - } - axis = axis_input; - } - - if (batch_dims_ != 0) { - if (batch_dims_ < 0) { - batch_dims_ = indices_shape.dims() + batch_dims_; - } - - axis = axis.value_or(batch_dims_); - - OP_REQUIRES(context, - batch_dims_ >= -indices_shape.dims() && - batch_dims_ < indices_shape.dims(), - errors::InvalidArgument("Expected batch_dims in the range [", - -indices_shape.dims(), ", ", - indices_shape.dims(), "), but got ", - batch_dims_)); - - OP_REQUIRES(context, batch_dims_ < input_shape.dims(), - errors::InvalidArgument("batch_dims (", batch_dims_, - ") must be less than rank(input) (", - input_shape.dims(), ").")); - - OP_REQUIRES(context, *axis >= batch_dims_, - errors::InvalidArgument("batch_dims (", batch_dims_, - ") must be less than or equal to ", - "axis (", *axis, ").")); - } - - axis = axis.value_or(0); - DataType index_type = input_type(1); - OP_REQUIRES(context, index_type == DT_INT32 || index_type == DT_INT64, - errors::InvalidArgument("indices must be int32 or int64")); xla::XlaOp gather; - if (batch_dims_ > 0) { - gather = xla::TorchIndexSelect(input, indices, *axis, batch_dims_); - } else { - // XlaGather() manages degenerate cases, like empty-indices, which are - // error conditions and caught above if batch_dims is not 0. - OP_REQUIRES_OK( - context, XlaGather(input, input_shape, indices, indices_shape, *axis, - /*indices_are_nd=*/false, input_type(0), - index_type, context->builder(), &gather)); - } + OP_REQUIRES_OK(context, + XlaGatherWithBatchDimsOpImpl(context, input, input_shape, + batch_dims_, &gather)); context->SetOutput(0, gather); } diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h index 92346283c31..7bd25230d46 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h +++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h @@ -39,6 +39,13 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, DataType index_type, xla::XlaBuilder* builder, xla::XlaOp* gather_output); +// The implementation of Gather and ResourceGather through XLA. 
Uses `input` as +// the input instead of context->input(0) in order to allow ResourceGather to +// handle obtaining the data from the ResourceVariable. +Status XlaGatherWithBatchDimsOpImpl(XlaOpKernelContext* context, + const xla::XlaOp input, + const TensorShape& input_shape, + int batch_dims, xla::XlaOp* gather_output); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_HELPERS_H_ diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index 4422af7d15f..a7dd1bb0079 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -93,24 +93,13 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(i + 1, &resource)); - arg.initialized = resource->initialized(); - arg.kind = XlaCompiler::Argument::kResource; - arg.resource_kind = resource->kind(); - - arg.type = resource->type(); - arg.shape = resource->shape(); + XlaCompiler::PopulateArgumentFromResource(*resource, &arg); OP_REQUIRES(ctx, arg.initialized, errors::Unimplemented("Uninitialized arguments: ", arg.name)); - arg.max_array_size = resource->max_array_size(); - for (const auto& gradient : resource->tensor_array_gradients()) { - arg.tensor_array_gradients.insert(gradient.first); - } - arg.name = resource->name(); VLOG(2) << "Resource " << resource->name() << " type: " << DataTypeString(arg.type) << " shape: " << arg.HumanString() << " initialized: " << arg.initialized; - num_resource_args++; } else { arg.kind = XlaCompiler::Argument::kParameter; @@ -220,6 +209,22 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { xla::ShapeUtil::HumanString(then_result.xla_output_shape), " vs. ", xla::ShapeUtil::HumanString(else_result.xla_output_shape))); + // Check that both branches have same TensorList output indices. + for (int output_index = 0; output_index < then_result.outputs.size(); + output_index++) { + bool is_tensor_list_in_then_branch = + then_result.outputs[output_index].is_tensor_list; + bool is_tensor_list_in_else_branch = + else_result.outputs[output_index].is_tensor_list; + OP_REQUIRES( + ctx, is_tensor_list_in_then_branch == is_tensor_list_in_else_branch, + errors::FailedPrecondition("Output #", output_index, " is ", + (is_tensor_list_in_then_branch ? "" : "not"), + " a TensorList in then branch, but is ", + (is_tensor_list_in_else_branch ? "" : "not"), + " a TensorList in else branch")); + } + VLOG(2) << "Input shape: " << xla::ShapeUtil::HumanString(then_input_shape); VLOG(2) << "Output shape: " << xla::ShapeUtil::HumanString(then_result.xla_output_shape); @@ -282,7 +287,12 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { LOG(INFO) << "Shape unknown for output " << i; } } - ctx->SetOutput(i, output_handle); + // We have checked that both branches have same TensorList output indices. + if (then_result.outputs[i].is_tensor_list) { + ctx->SetTensorListOutput(i, output_handle); + } else { + ctx->SetOutput(i, output_handle); + } } if (has_token_input_output_) { // Set token output for this "If" op. 
Token output is the last output of diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index 273ac3dd5ae..bf9a9150ea6 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -106,13 +106,34 @@ class ReshapeOp : public XlaOpKernel { " values, but the requested shape has ", shape.num_elements())); - VLOG(1) << "Reshape from " << input_shape.DebugString() << " to " + VLOG(2) << "Reshape from " << input_shape.DebugString() << " to " << shape.DebugString() << ", unknown_index=" << unknown_index; + shape_input.clear(); + // Run get input again, this time with dynamic dimension represented as + // "-1" + ctx->set_dynamic_dimension_is_minus_one(true); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &shape_input)); + + int dynamic_dimension = -1; + + for (int d = 0; d < num_dims; ++d) { + const int32 size = shape_input[d]; + if (size == -1) { + if (dynamic_dimension == -1) { + dynamic_dimension = d; + } else { + if (unknown_index != d) { + dynamic_dimension = d; + } + } + } + } + // Pass unknown_index to Xla::Reshape as a hint for dynamic shape inference // in XLA to know which output dimension is dynamic. ctx->SetOutput(0, xla::ReshapeWithInferredDimension( - ctx->Input(0), shape.dim_sizes(), unknown_index)); + ctx->Input(0), shape.dim_sizes(), dynamic_dimension)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc index 93dfc189fd6..ce4a46b45c8 100644 --- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc @@ -20,7 +20,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -108,18 +110,26 @@ class ScatterNdOp : public XlaOpKernel { buffer_shape.dim_sizes()); auto indices = context->Input(0); auto updates = context->Input(1); + auto combine = + context->input_xla_type(1) == xla::PRED ? 
CombineBool : CombineNum; auto result = XlaScatter(buffer, updates, indices, - /*indices_are_vectors=*/true, /*combiner=*/Combine, builder); + /*indices_are_vectors=*/true, /*combiner=*/combine, builder); OP_REQUIRES_OK(context, result.status()); context->SetOutput(0, result.ValueOrDie()); } private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, - xla::XlaBuilder* builder) { + static xla::XlaOp CombineNum(const xla::XlaOp x, const xla::XlaOp y, + xla::XlaBuilder* builder) { + (void)builder; return xla::Add(x, y); } + static xla::XlaOp CombineBool(const xla::XlaOp x, const xla::XlaOp y, + xla::XlaBuilder* builder) { + (void)builder; + return xla::Or(x, y); + } }; REGISTER_XLA_OP(Name("ScatterNd").CompileTimeConstantInput("shape"), diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 88af12dacee..06095631434 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -119,7 +119,9 @@ class SizeOp : public XlaOpKernel { xla::XlaBuilder* builder = ctx->builder(); auto size = xla::One(builder, xla::U32); for (int64 i = 0; i < rank; ++i) { - size = xla::Mul(size, xla::GetDimensionSize(ctx->Input(0), i)); + size = xla::Mul( + size, xla::ConvertElementType(xla::GetDimensionSize(ctx->Input(0), i), + xla::U32)); } size = xla::ConvertElementType(size, ctx->output_xla_type(0)); ctx->SetOutput(0, size); diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 9da1504bff1..546b0e7f9e1 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -14,12 +14,14 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/util/strided_slice_op.h" + #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/ops_util.h" @@ -44,60 +46,124 @@ class StridedSliceOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { const TensorShape input_shape = ctx->InputShape(0); + const TensorShape begin_shape = ctx->InputShape("begin"); + + OP_REQUIRES( + ctx, begin_shape.dims() == 1, + errors::InvalidArgument("'begin' input has to be a rank 1 vector")); - TensorShape final_shape; absl::InlinedVector begin; absl::InlinedVector end; absl::InlinedVector strides; xla::Literal begin_literal, end_literal, strides_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &begin_literal)); - OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &end_literal)); + bool begin_is_constant = ctx->ConstantInput(1, &begin_literal).ok(); + bool end_is_constant = ctx->ConstantInput(2, &end_literal).ok(); + OP_REQUIRES_OK(ctx, ctx->ConstantInput(3, &strides_literal)); Tensor begin_tensor, end_tensor, strides_tensor; - OP_REQUIRES_OK( - ctx, LiteralToHostTensor(begin_literal, index_type_, &begin_tensor)); - OP_REQUIRES_OK(ctx, - LiteralToHostTensor(end_literal, index_type_, &end_tensor)); + if (begin_is_constant) { + OP_REQUIRES_OK( + ctx, LiteralToHostTensor(begin_literal, index_type_, &begin_tensor)); + } + if (end_is_constant) { + OP_REQUIRES_OK( + ctx, LiteralToHostTensor(end_literal, index_type_, &end_tensor)); + } OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_, &strides_tensor)); - TensorShape dummy_processing_shape; + TensorShape final_shape; + PartialTensorShape dummy_processing_shape, partial_final_shape; bool dummy = false; - OP_REQUIRES_OK(ctx, - ValidateStridedSliceOp( - &begin_tensor, &end_tensor, strides_tensor, input_shape, - begin_mask_, end_mask_, ellipsis_mask_, new_axis_mask_, - shrink_axis_mask_, &dummy_processing_shape, &final_shape, - &dummy, &dummy, &dummy, &begin, &end, &strides)); + OP_REQUIRES_OK(ctx, ValidateStridedSliceOp( + begin_is_constant ? &begin_tensor : nullptr, + end_is_constant ? &end_tensor : nullptr, + strides_tensor, input_shape, begin_mask_, end_mask_, + ellipsis_mask_, new_axis_mask_, shrink_axis_mask_, + &dummy_processing_shape, &partial_final_shape, + &dummy, &dummy, &dummy, &begin, &end, &strides)); - absl::InlinedVector dimensions_to_reverse; - absl::InlinedVector slice_begin, slice_end, slice_strides; - - for (int i = 0; i < begin.size(); ++i) { - if (strides[i] > 0) { - slice_begin.push_back(begin[i]); - slice_end.push_back(std::max(end[i], begin[i])); - slice_strides.push_back(strides[i]); - } else { - // Negative stride: swap begin and end, add 1 because the interval - // is semi-open, and mark the dimension to be reversed. 
- slice_begin.push_back(input_shape.dim_size(i) - begin[i] - 1); - slice_end.push_back(std::max(input_shape.dim_size(i) - end[i] - 1, - input_shape.dim_size(i) - begin[i] - 1)); - slice_strides.push_back(-strides[i]); - dimensions_to_reverse.push_back(i); - } - } + OP_REQUIRES(ctx, partial_final_shape.AsTensorShape(&final_shape), + errors::InvalidArgument( + "XLA can't deduce compile time constant output " + "shape for strided slice: ", + partial_final_shape.DebugString(), + ", output shape must be a compile-time constant")); xla::XlaOp slice = ctx->Input(0); - if (!dimensions_to_reverse.empty()) { - slice = xla::Rev(slice, dimensions_to_reverse); + if (begin_is_constant && end_is_constant) { + absl::InlinedVector dimensions_to_reverse; + absl::InlinedVector slice_begin, slice_end, slice_strides; + for (int i = 0; i < begin.size(); ++i) { + if (strides[i] > 0) { + slice_begin.push_back(begin[i]); + slice_end.push_back(std::max(end[i], begin[i])); + slice_strides.push_back(strides[i]); + } else { + // Negative stride: swap begin and end, add 1 because the interval + // is semi-open, and mark the dimension to be reversed. + slice_begin.push_back(input_shape.dim_size(i) - begin[i] - 1); + slice_end.push_back(std::max(input_shape.dim_size(i) - end[i] - 1, + input_shape.dim_size(i) - begin[i] - 1)); + slice_strides.push_back(-strides[i]); + dimensions_to_reverse.push_back(i); + } + } + if (!dimensions_to_reverse.empty()) { + slice = xla::Rev(slice, dimensions_to_reverse); + } + slice = xla::Slice(slice, slice_begin, slice_end, slice_strides); + } else { + // When output shape is fully defined, it must be a size one slice: + // + // 1. The number of output elements has to equal to number of input + // elements that are sliced. + // 2. The stride of the slice dimensions must be exact one. + int64 output_elements = final_shape.num_elements(); + + int64 input_elements_sliced = 1; + int64 slicing_dim_size = begin_shape.dim_size(0); + // We only support slicing major dimensions, so minor dimensions after + for (int64 d = slicing_dim_size; d < input_shape.dims(); ++d) { + input_elements_sliced *= input_shape.dim_size(d); + } + + OP_REQUIRES( + ctx, output_elements == input_elements_sliced, + errors::InvalidArgument( + "The number of output elements ", output_elements, + " has to equal to number of input elements that are sliced ", + input_elements_sliced, " when input indices are not constant.")); + + for (int64 i = 0; i < ctx->InputShape("begin").dims(); ++i) { + OP_REQUIRES( + ctx, strides[i] == 1, + errors::InvalidArgument( + "Strides have to be one when inputs are not constant.")); + } + + // When inputs are not compile time constants, shape inference can only + // inference size 1 slice. + std::vector slice_sizes(slicing_dim_size, 1); + std::vector start_indices; + for (int64 d = 0; d < slicing_dim_size; ++d) { + auto index = xla::Slice(ctx->Input("begin"), {d}, {d + 1}, {1}); + // Convert index to scalar. + start_indices.push_back(xla::Reshape(index, {})); + } + + for (int64 d = slicing_dim_size; d < input_shape.dims(); ++d) { + // For non-slice dims, naturally we get the full slice starting from 0. 
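// Worked example (illustrative values only): an input of shape [5, 8] with
// a non-constant `begin` of shape [1] and stride 1 slices only dimension 0,
// so slice_sizes starts as {1}; dimension 1 is a non-slice dimension and
// keeps its full size 8 with a start index of 0, and the op lowers to
// xla::DynamicSlice(input, start_indices, {1, 8}).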
+ slice_sizes.push_back(input_shape.dim_size(d)); + start_indices.push_back( + xla::Zero(ctx->builder(), ctx->InputXlaType("begin"))); + } + + std::vector output_shape_dim_sizes; + slice = xla::DynamicSlice(slice, start_indices, slice_sizes); } - - slice = xla::Slice(slice, slice_begin, slice_end, slice_strides); - slice = xla::Reshape(slice, final_shape.dim_sizes()); ctx->SetOutput(0, slice); } diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index 7b4125ab76e..60424f85840 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/kernel_def_builder.h" @@ -122,27 +123,24 @@ REGISTER_XLA_OP( class ResourceGatherOp : public XlaOpKernel { public: - explicit ResourceGatherOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + explicit ResourceGatherOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("batch_dims", &batch_dims_)); + } void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* builder = ctx->builder(); - DataType type = ctx->expected_output_dtype(0); - TensorShape resource_shape; - xla::XlaOp resource_handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &resource_shape, - &resource_handle)); + TensorShape input_shape; + xla::XlaOp input; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &input_shape, &input)); - auto indices = ctx->Input(1); - auto indices_shape = ctx->InputShape(1); - DataType index_type = ctx->input_type(1); xla::XlaOp gather; - OP_REQUIRES_OK( - ctx, XlaGather(resource_handle, resource_shape, indices, indices_shape, - /*axis=*/0, /*indices_are_nd=*/false, type, index_type, - builder, &gather)); + OP_REQUIRES_OK(ctx, XlaGatherWithBatchDimsOpImpl(ctx, input, input_shape, + batch_dims_, &gather)); ctx->SetOutput(0, gather); } + + private: + int32 batch_dims_; }; REGISTER_XLA_OP(Name("ResourceGather"), ResourceGatherOp); diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index e82519f68ca..36c35f3c83b 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -81,29 +81,17 @@ Status MakeXlaCompilerArgumentsFromInputs( if (type == DT_RESOURCE) { XlaResource* resource; TF_RETURN_IF_ERROR(ctx->GetResourceInput(i, &resource)); - - arg.initialized = resource->initialized(); - arg.kind = XlaCompiler::Argument::kResource; - arg.resource_kind = resource->kind(); + XlaCompiler::PopulateArgumentFromResource(*resource, &arg); if (arg.resource_kind == XlaResource::kTensorArray) { *has_tensor_arrays = true; } - - arg.type = resource->type(); - arg.shape = resource->shape(); if (!arg.initialized) { *has_uninitialized_vars = true; } - arg.max_array_size = resource->max_array_size(); - for (const auto& gradient : resource->tensor_array_gradients()) { - arg.tensor_array_gradients.insert(gradient.first); - } - arg.name = resource->name(); VLOG(2) << " resource " << resource->name() << " type: " << DataTypeString(arg.type) << " shape: " << arg.ShapeHumanString() << " initialized: " << 
arg.initialized; - } else { arg.kind = XlaCompiler::Argument::kParameter; arg.type = type; diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index f6fc0c526d9..43d9e5d0e10 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -624,7 +624,7 @@ REGISTER_OP("XlaEinsum") .Input("b: T") .Output("product: T") .Attr("equation: string") - .Attr("T: {bfloat16, float}") + .Attr("T: {complex64, bfloat16, float}") .SetShapeFn([](shape_inference::InferenceContext* context) { shape_inference::ShapeHandle input_a = context->input(0); shape_inference::ShapeHandle input_b = context->input(1); diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index f1bf88b5418..54beb0aebfe 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -161,9 +161,10 @@ Status BuildComputation( const std::vector>& resources, std::unique_ptr token_output, const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, - bool return_updated_values_for_all_resources, bool always_return_tuple, - xla::XlaBuilder* builder, xla::XlaComputation* computation, - int* num_computation_outputs, int* num_nonconst_outputs, + bool is_entry_computation, bool return_updated_values_for_all_resources, + bool always_return_tuple, xla::XlaBuilder* builder, + xla::XlaComputation* computation, int* num_computation_outputs, + int* num_nonconst_outputs, std::vector* outputs, std::vector* resource_updates, xla::Shape* output_shape) { @@ -173,6 +174,7 @@ Status BuildComputation( xla::OpMetadata retval_metadata; retval_metadata.set_op_name("XLA_Retvals"); builder->SetOpMetadata(retval_metadata); + VLOG(1) << "Building new computation"; auto cleanup = gtl::MakeCleanup([builder]() { builder->ClearOpMetadata(); }); // Builds a no-op XLA computation. We need to set the sharding of outputs, but @@ -189,6 +191,10 @@ Status BuildComputation( // a descending layout is used. The first element is the output index, second // element is the new layout. std::vector> retval_index_and_layout; + // Keeps track of sharding of each retval. If a retval is not in this list, + // replicate sharding is used. The first element is the output index, second + // element is the sharding. + std::unordered_map retval_index_and_sharding; for (int i = 0; i < retvals.size(); ++i) { XlaCompiler::OutputDescription& output = (*outputs)[i]; const XlaExpression& retval = retvals[i]; @@ -216,6 +222,9 @@ Status BuildComputation( builder, it == retval_shardings.end() ? absl::optional() : it->second); + if (it != retval_shardings.end()) { + retval_index_and_sharding[elems.size()] = it->second; + } if (shape_representation_fn) { // If there is a shape representation function, reshape the output // tensor to the shape given by the representation shape function. @@ -290,6 +299,9 @@ Status BuildComputation( xla::XlaScopedShardingAssignment assign_sharding( builder, it == arg_shardings.end() ? absl::optional() : it->second); + if (it != arg_shardings.end()) { + retval_index_and_sharding[elems.size()] = it->second; + } xla::XlaOp handle; TF_RETURN_IF_ERROR(resource->Pack(&handle, builder)); @@ -334,7 +346,44 @@ Status BuildComputation( // Builds the XLA computation. We *always* form a tuple here to ensure that // the output value is the last thing added into the XLA computation, even // if there is only one output value. 
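// Sketch of the sharding handling added below (illustrative values only):
// for a single retval of shape f32[2] that carries a tile sharding over two
// devices, the per-element shardings are collected into
//   xla::HloSharding::Tuple(ShapeUtil::MakeTupleShape({f32[2]}), {tile})
// and assigned to the root tuple through XlaScopedShardingAssignment; the
// SetShardingForReturnedTuple test added in xla_compiler_test.cc verifies
// this proto on the root instruction.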
- auto tuple = xla::Tuple(builder, elems); + xla::XlaOp tuple; + if (retval_index_and_sharding.empty() || !is_entry_computation) { + tuple = xla::Tuple(builder, elems); + } else { + std::vector elem_shapes; + for (const auto& elem : elems) { + TF_ASSIGN_OR_RETURN(xla::Shape elem_shape, + elem.builder()->GetShape(elem)); + elem_shapes.push_back(elem_shape); + } + xla::Shape shape = xla::ShapeUtil::MakeTupleShape(elem_shapes); + // Copy specified sharding from retval_index_and_sharding. + std::vector sharding_elems; + for (int i = 0; i < elems.size(); i++) { + const auto& iter = retval_index_and_sharding.find(i); + TF_RET_CHECK(iter != retval_index_and_sharding.end()); + const xla::OpSharding& sub_op_sharding = iter->second; + TF_ASSIGN_OR_RETURN(xla::HloSharding sub_sharding, + xla::HloSharding::FromProto(sub_op_sharding)); + if (elem_shapes[i].IsTuple()) { + const std::vector sub_sharding_elems = + sub_sharding.tuple_elements(); + TF_RET_CHECK(sub_sharding_elems.size() == + xla::ShapeUtil::GetLeafCount(elem_shapes[i])); + for (const auto& sub_sharding_elem : sub_sharding_elems) { + sharding_elems.push_back(sub_sharding_elem); + } + } else { + sharding_elems.push_back(sub_sharding); + } + } + xla::HloSharding modified_sharding = + xla::HloSharding::Tuple(shape, sharding_elems); + xla::OpSharding op_sharding = modified_sharding.ToProto(); + // Assign proper sharding to the tuple instruction. + xla::XlaScopedShardingAssignment assign_sharding(builder, op_sharding); + tuple = xla::Tuple(builder, elems); + } if (!always_return_tuple && elems.size() == 1) { xla::GetTupleElement(tuple, 0); } @@ -793,6 +842,22 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, } } +/* static */ +void XlaCompiler::PopulateArgumentFromResource(const XlaResource& resource, + Argument* arg) { + arg->initialized = resource.initialized(); + arg->kind = XlaCompiler::Argument::kResource; + arg->resource_kind = resource.kind(); + + arg->type = resource.type(); + arg->shape = resource.shape(); + arg->max_array_size = resource.max_array_size(); + for (const auto& gradient : resource.tensor_array_gradients()) { + arg->tensor_array_gradients.insert(gradient.first); + } + arg->name = resource.name(); +} + // Builds XLA computations for each of the arguments to the computation. // `args` are the arguments to the computation. Status XlaCompiler::BuildArguments( @@ -915,6 +980,9 @@ Status XlaCompiler::BuildArguments( const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); + VLOG(1) << "Setting dynamic binding " << i << " -> " + << dynamic_size_param_index; + TF_RETURN_IF_ERROR(builder->SetDynamicBinding( /*dynamic_size_param_num=*/0, {dynamic_size_param_index}, /*target_param_num=*/0, /*target_param_index=*/{i}, @@ -1170,7 +1238,7 @@ Status XlaCompiler::CompileGraph( std::unique_ptr graph, absl::Span args, absl::Span user_aliases, CompilationResult* result) { - VLOG(1) << "Executing graph symbolically to populate XlaBuilder."; + VLOG(1) << "Executing graph symbolically to populate XlaBuilder.: " << name; TF_RETURN_IF_ERROR(PropagateConstIntoFunctionalNodes( graph.get(), options_.flib_def, local_flib_def_.get())); @@ -1291,6 +1359,7 @@ Status XlaCompiler::CompileGraph( std::move(token_output), options.is_entry_computation ? 
options_.shape_representation_fn : ShapeRepresentationFn{}, + options.is_entry_computation, options.return_updated_values_for_all_resources, options.always_return_tuple, &builder, result->computation.get(), &num_computation_outputs, &num_nonconst_outputs, &result->outputs, diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 6ab8bde542d..4b4ee02aad9 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -351,6 +351,10 @@ class XlaCompiler { ~XlaCompiler(); + // Helper function to populate an XlaCompiler::Argument from XlaResource. + static void PopulateArgumentFromResource(const XlaResource& resource, + Argument* arg); + Status CompileFunction(const CompileOptions& options, const NameAttrList& fn_name_attrs, absl::Span args, diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 4413625dc3c..324c31e8bf9 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -1738,5 +1739,56 @@ TEST_F(XlaCompilerTest, WhileWithResources) { EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal)); } +TEST_F(XlaCompilerTest, SetShardingForReturnedTuple) { + // Builds a graph that returns its only argument. + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); + auto b = ops::_Retval(scope.WithOpName("B"), a, 0); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Sets _XlaSharding attribute for the _Retval node. + auto node_name_index = graph->BuildNodeNameIndex(); + Node* ret_node = node_name_index["B"]; + ASSERT_NE(ret_node, nullptr); + xla::Array tile_assignment({2}); + tile_assignment.FillIota(0); + xla::HloSharding sharding = xla::HloSharding::Tile(tile_assignment); + ret_node->AddAttr("_XlaSharding", sharding.ToProto().SerializeAsString()); + + // Builds a description of the arguments. + std::vector args(1); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + + // Compiles the graph. + XlaCompiler compiler(DefaultOptions()); + + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "test", + std::move(graph), args, + /*user_aliases=*/{}, &result)); + + // Tests that we set sharding on the root TUPLE instruction. 
+ const auto& hlo_module_proto = result.computation->proto(); + ASSERT_EQ(hlo_module_proto.computations_size(), 1); + const auto& hlo_computation_proto = hlo_module_proto.computations(0); + absl::optional root_instruction_proto; + for (const auto& inst : hlo_computation_proto.instructions()) { + if (inst.id() == hlo_computation_proto.root_id()) { + root_instruction_proto = inst; + break; + } + } + ASSERT_TRUE(root_instruction_proto); + xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape( + {xla::ShapeUtil::MakeShape(xla::S32, {2})}); + xla::HloSharding tuple_sharding = xla::HloSharding::Tuple( + tuple_shape, std::vector{sharding}); + EXPECT_EQ(root_instruction_proto->sharding().SerializeAsString(), + tuple_sharding.ToProto().SerializeAsString()); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc index 3d228c92adc..0aa139ce4f0 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.cc +++ b/tensorflow/compiler/tf2xla/xla_expression.cc @@ -102,7 +102,7 @@ xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const { } xla::StatusOr> XlaExpression::ResolveConstant( - xla::Client* client) const { + xla::Client* client, bool dynamic_dimension_is_minus_one) const { switch (kind()) { case Kind::kConstant: return {constant_value()}; @@ -122,7 +122,8 @@ xla::StatusOr> XlaExpression::ResolveConstant( if (!is_constant) return {absl::nullopt}; TF_ASSIGN_OR_RETURN(xla::XlaComputation constant_graph, - handle().builder()->BuildConstantSubGraph(handle())); + handle().builder()->BuildConstantSubGraph( + handle(), dynamic_dimension_is_minus_one)); TF_ASSIGN_OR_RETURN(TensorShape shape, GetShape()); diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h index ac0232d8924..5d0bb35b182 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.h +++ b/tensorflow/compiler/tf2xla/xla_expression.h @@ -97,7 +97,7 @@ class XlaExpression { // optional if it cannot be resolved. Returns an error if passed a resource // expression. xla::StatusOr> ResolveConstant( - xla::Client* client) const; + xla::Client* client, bool dynamic_dimension_is_minus_one = false) const; // Returns the shape of the tensor. // The shape of a resource is the shape of a resource handle (i.e., a scalar), diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index c95cd4e5475..a1941cc5fdf 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -31,7 +31,7 @@ limitations under the License. 
namespace tensorflow { XlaOpKernelContext::XlaOpKernelContext(OpKernelContext* context) - : context_(context) {} + : context_(context), dynamic_dimension_is_minus_one_(false) {} bool XlaOpKernelContext::ValidateInputsAreSameShape(OpKernel* op) { return context_->ValidateInputsAreSameShape(op); @@ -166,7 +166,7 @@ Status XlaOpKernelContext::ConstantInputReshaped( xla::Literal* constant_literal) { XlaExpression e = InputExpression(index); xla::StatusOr> constant_or_status = - e.ResolveConstant(compiler()->client()); + e.ResolveConstant(compiler()->client(), dynamic_dimension_is_minus_one_); if (!constant_or_status.ok()) { Status status = constant_or_status.status(); errors::AppendToMessage(&status, "while evaluating input ", index, " of ", diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 7794786f905..3e75cf7fa58 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -202,6 +202,17 @@ class XlaOpKernelContext { Status GetVariableTypeAndShape(int index, DataType* type, TensorShape* shape) const; + // When dynamic_dimension_is_minus_one is set, querying a dynamic dimension + // returns "-1", this is useful when the underlying ops expect explicit + // dynamic index like reshape. + void set_dynamic_dimension_is_minus_one(bool value) { + dynamic_dimension_is_minus_one_ = value; + } + + bool dynamic_dimension_is_minus_one() const { + return dynamic_dimension_is_minus_one_; + } + // Reads the current value of the resouce variable referred to by input // `index`. If `shape` is not nullptr, sets `*shape` to the shape of the // variable. Returns an error if the variable has not been initialized, or if @@ -280,6 +291,7 @@ class XlaOpKernelContext { xla::Literal* constant_literal); OpKernelContext* const context_; + bool dynamic_dimension_is_minus_one_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 24105f2f162..b4752813c8c 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -58,6 +58,16 @@ xla_proto_library( ], ) +tf_proto_library_py( + name = "xla_proto", # bzl adds a _py suffix + srcs = ["xla.proto"], + visibility = ["//visibility:public"], + deps = [ + ":xla_data_proto_py", + "//tensorflow/compiler/xla/service:hlo_proto_py", + ], +) + cc_library( name = "bit_cast", hdrs = ["bit_cast.h"], diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index b46d04dc328..38a34dfb563 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -310,7 +310,9 @@ xla_test( srcs = ["slicing_test.cc"], deps = [ ":slicing", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/client:xla_builder", diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc index 0940a873fa4..de573429fdc 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc @@ -27,9 +27,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { -namespace { - -using XlaOpGenerator = XlaOp (*)(XlaBuilder*, const XlaOp&, const XlaOp&); XlaComputation CreateScalarComputation(const string& name, PrimitiveType type, XlaBuilder* builder, @@ -45,69 +42,50 @@ XlaComputation CreateScalarComputation(const string& name, PrimitiveType type, const Shape scalar = ShapeUtil::MakeShape(type, {}); auto lhs = Parameter(b.get(), 0, scalar, "lhs"); auto rhs = Parameter(b.get(), 1, scalar, "rhs"); - generator(b.get(), lhs, rhs); + generator(lhs, rhs); return b->BuildAndNoteError(); } -} // namespace - XlaComputation CreateScalarAddComputation(PrimitiveType type, XlaBuilder* builder) { return CreateScalarComputation( - "add", type, builder, - [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return Add(lhs, rhs); - }); + "add", type, builder, [](XlaOp lhs, XlaOp rhs) { return Add(lhs, rhs); }); } XlaComputation CreateScalarMultiplyComputation(PrimitiveType type, XlaBuilder* builder) { return CreateScalarComputation( - "mul", type, builder, - [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return Mul(lhs, rhs); - }); + "mul", type, builder, [](XlaOp lhs, XlaOp rhs) { return Mul(lhs, rhs); }); } XlaComputation CreateScalarGeComputation(PrimitiveType type, XlaBuilder* builder) { - return CreateScalarComputation("ge", type, builder, - [](XlaBuilder* b, const XlaOp& lhs, - const XlaOp& rhs) { return Ge(lhs, rhs); }); + return CreateScalarComputation( + "ge", type, builder, [](XlaOp lhs, XlaOp rhs) { return Ge(lhs, rhs); }); } XlaComputation CreateScalarMaxComputation(PrimitiveType type, XlaBuilder* builder) { return CreateScalarComputation( - "max", type, builder, - [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return Max(lhs, rhs); - }); + "max", type, builder, [](XlaOp lhs, XlaOp rhs) { return Max(lhs, rhs); }); } XlaComputation CreateScalarMinComputation(PrimitiveType type, XlaBuilder* builder) { return CreateScalarComputation( - "min", type, builder, - [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return Min(lhs, rhs); - }); + "min", type, builder, [](XlaOp lhs, XlaOp rhs) { return Min(lhs, rhs); }); } XlaComputation CreateScalarAndComputation(PrimitiveType type, XlaBuilder* builder) { return CreateScalarComputation( - "and", type, builder, - [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return And(lhs, rhs); - }); + "and", type, builder, [](XlaOp lhs, XlaOp rhs) { return And(lhs, rhs); }); } XlaComputation CreateScalarOrComputation(PrimitiveType type, XlaBuilder* builder) { - return CreateScalarComputation("or", type, builder, - [](XlaBuilder* b, const XlaOp& lhs, - const XlaOp& rhs) { return Or(lhs, rhs); }); + return CreateScalarComputation( + "or", type, builder, [](XlaOp lhs, XlaOp rhs) { return Or(lhs, rhs); }); } XlaComputation CreateScalarIdentityWithZeroComputation(PrimitiveType type, diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h index 270076a1586..350dcc5531d 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.h +++ b/tensorflow/compiler/xla/client/lib/arithmetic.h @@ -24,6 +24,13 @@ limitations under the License. namespace xla { +using XlaOpGenerator = std::function; + +// Creates a scalar computation based on a lambda and returns it. +XlaComputation CreateScalarComputation(const string& name, PrimitiveType type, + XlaBuilder* builder, + XlaOpGenerator generator); + // Creates a scalar add computation and returns it. 
XlaComputation CreateScalarAddComputation(PrimitiveType type, XlaBuilder* builder); diff --git a/tensorflow/compiler/xla/client/lib/qr.cc b/tensorflow/compiler/xla/client/lib/qr.cc index 5a7c826c389..6c4b9f9c973 100644 --- a/tensorflow/compiler/xla/client/lib/qr.cc +++ b/tensorflow/compiler/xla/client/lib/qr.cc @@ -207,14 +207,39 @@ StatusOr QRBlock(XlaOp a, PrecisionConfig::Precision precision) { auto new_x = Mul(x, predecessor_mask, /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) + Mul(beta, mask, /*broadcast_dimensions=*/batch_dim_indices); - a = DynamicUpdateSliceInMinorDims(a, new_x, {j}); + // Update a[:,j] + std::vector dim_ids(num_dims); + std::iota(dim_ids.begin(), dim_ids.end(), 0); + new_x = BroadcastInDim(new_x, ConcatVectors(batch_dims, {m, n}), + /*broadcast_dimensions=*/dim_ids); + const int64 minor_dim = batch_dims.size(); + auto iota_mn = Iota( + builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {m, n})), + minor_dim + 1); + a = Select(Eq(iota_mn, j), new_x, a); // vs[:, j] = v - vs = DynamicUpdateSliceInMinorDims( - vs, Reshape(v, ConcatVectors(batch_dims, {m, 1})), {j}); + std::vector vs_broadcast_dims(batch_dims.size() + 1); + std::iota(vs_broadcast_dims.begin(), vs_broadcast_dims.end(), 0); + auto vs_zeros = ZerosLike(vs); + auto vs_update = Select( + Eq(iota_mn, j), + Add(vs_zeros, v, /*broadcast_dimensions=*/vs_broadcast_dims), vs_zeros); + vs = vs + vs_update; + // taus[j] = tau - taus = DynamicUpdateSliceInMinorDims( - taus, Reshape(tau, ConcatVectors(batch_dims, {1})), {j}); + std::vector tau_broadcast_dims(batch_dims.size()); + std::iota(tau_broadcast_dims.begin(), tau_broadcast_dims.end(), 0); + + auto iota_n = + Iota(builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {n})), + minor_dim); + auto taus_zeros = ZerosLike(taus); + auto taus_update = Select( + Eq(iota_n, j), + Add(taus_zeros, tau, /*broadcast_dimensions=*/tau_broadcast_dims), + taus_zeros); + taus = taus + taus_update; return std::vector{a, vs, taus}; }; diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc index 83c1045448d..b47ddb7919f 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.cc +++ b/tensorflow/compiler/xla/client/lib/slicing.cc @@ -208,6 +208,43 @@ XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim, bool sparse) { }); } +XlaOp TorchScatterDense(XlaOp input, XlaOp index, XlaOp src, int64 dim, + const std::function& combiner) { + XlaBuilder* builder = input.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape index_shape, builder->GetShape(index)); + TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); + std::vector index_broacast_dims; + std::vector sizes; + for (int64 i = 0; i < index_shape.rank(); ++i) { + if (i < dim) { + index_broacast_dims.push_back(i); + } else { + if (i == dim) { + sizes.push_back(input_shape.dimensions(i)); + } + index_broacast_dims.push_back(i + 1); + } + sizes.push_back(index_shape.dimensions(i)); + } + auto mask = + Eq(BroadcastInDim(index, sizes, index_broacast_dims), + Iota(builder, + ShapeUtil::MakeShape(index_shape.element_type(), sizes), dim)); + auto masked_src = + Select(mask, BroadcastInDim(src, sizes, index_broacast_dims), + Zeros(builder, + ShapeUtil::MakeShape(input_shape.element_type(), sizes))); + + return combiner( + input, + Reduce(masked_src, Zero(builder, input_shape.element_type()), + CreateScalarComputation("reducer", input_shape.element_type(), + builder, combiner), + {dim + 1})); + }); +} + XlaOp 
TorchIndexSelect(XlaOp input, XlaOp index, int64 dim, int64 batch_dims) { XlaBuilder* builder = input.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { @@ -238,10 +275,8 @@ XlaOp TorchIndexSelect(XlaOp input, XlaOp index, int64 dim, int64 batch_dims) { } for (int64 i = 0; i < input_shape.rank(); ++i) { if (i < batch_dims || i == dim) { - if (slice_sizes[i] != 0) { - slice_sizes[i] = 1; - gather_dnums.add_collapsed_slice_dims(i); - } + slice_sizes[i] = std::min(slice_sizes[i], 1); + gather_dnums.add_collapsed_slice_dims(i); gather_dnums.add_start_index_map(i); } else { if (i < dim) { diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h index 9a59a048b9f..cf83d63cec2 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.h +++ b/tensorflow/compiler/xla/client/lib/slicing.h @@ -57,6 +57,13 @@ XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update, // `index`. XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim, bool sparse = true); +// idx = index[i][j][k] +// output[idx][j][k] = combiner(input[idx][j][k], src[i][j][k]) # if dim == 0 +// output[i][idx][k] = combiner(input[i][idx][k], src[i][j][k]) # if dim == 1 +// output[i][j][idx] = combiner(input[i][j][idx], src[i][j][k]) # if dim == 2 +XlaOp TorchScatterDense(XlaOp input, XlaOp index, XlaOp src, int64 dim, + const std::function& combiner); + // Returns a new tensor which indexes the input tensor along dimension dim using // the entries in index. // diff --git a/tensorflow/compiler/xla/client/lib/slicing_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc index 107cbae0a73..8e2e713c45c 100644 --- a/tensorflow/compiler/xla/client/lib/slicing_test.cc +++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc @@ -16,7 +16,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -130,6 +132,24 @@ XLA_TEST_F(SlicingTest, TorchGatherDense) { {input_data.get(), index_data.get()}); } +XLA_TEST_F(SlicingTest, TorchScatterDense) { + xla::XlaBuilder builder(TestName()); + + xla::XlaOp src, index, input; + auto input_data = CreateR2Parameter({{0, 0, 0}, {0, 0, 0}}, 0, "input", + &builder, &input); + auto index_data = + CreateR2Parameter({{1, 0}, {1, 2}}, 1, "index", &builder, &index); + auto src_data = + CreateR2Parameter({{1, 2}, {3, 4}}, 2, "src", &builder, &src); + TorchScatterDense(input, index, src, 1, + [](XlaOp l, XlaOp r) { return l + r; }); + + ComputeAndCompareR2( + &builder, {{2, 1, 0}, {0, 3, 4}}, + {input_data.get(), index_data.get(), src_data.get()}); +} + XLA_TEST_F(SlicingTest, TorchIndexSelectOn0) { xla::XlaBuilder builder(TestName()); @@ -180,6 +200,35 @@ XLA_TEST_F(SlicingTest, EmptyIndexSelect) { {input_data.get(), index_data.get()}); } +XLA_TEST_F(SlicingTest, DoubleEmptyIndexSelect) { + xla::XlaBuilder builder(TestName()); + + xla::XlaOp input, index; + Literal l(ShapeUtil::MakeShape(F32, {0, 1, 2, 0})); + Literal i(ShapeUtil::MakeShape(S32, {0})); + auto input_data = + CreateParameterAndTransferLiteral(0, l, "input", &builder, &input); + auto index_data = + CreateParameterAndTransferLiteral(1, i, "index", &builder, &index); + TorchIndexSelect(input, index, 0); + ComputeAndCompareLiteral(&builder, l, {input_data.get(), index_data.get()}); +} + +XLA_TEST_F(SlicingTest, EmptyIndexSelectNonZero) { + xla::XlaBuilder builder(TestName()); + + xla::XlaOp input, index; + Literal l(ShapeUtil::MakeShape(F32, {0, 2})); + auto input_data = + CreateParameterAndTransferLiteral(0, l, "input", &builder, &input); + auto index_data = + CreateR1Parameter({0, 0, 0}, 1, "index", &builder, &index); + TorchIndexSelect(input, index, 0); + ComputeAndCompareR2(&builder, + {{0.0f, 0.0f}, {0.0f, 0.0f}, {0.0f, 0.0f}}, + {input_data.get(), index_data.get()}); +} + XLA_TEST_F(SlicingTest, BatchTorchIndexSelectOn0) { xla::XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index b697fb031fd..f5e66c6d586 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -37,6 +37,11 @@ namespace xla { class LocalExecutable { public: + // Low-level constructor; LocalClient::Compile() is the usual way to create + // executables. + LocalExecutable(std::unique_ptr executable, Backend* backend, + ExecutableBuildOptions build_options); + // Run the compiled computation with the given arguments and options and // return the result. StatusOr Run( @@ -56,13 +61,6 @@ class LocalExecutable { Executable* executable() const { return executable_.get(); } private: - // Only a local client can construct these objects. - friend class LocalClient; - - // Constructor invoked by LocalClient. - LocalExecutable(std::unique_ptr executable, Backend* backend, - ExecutableBuildOptions build_options); - // Validates that the given arguments and options satisfy various constraints // of the computation. 
// diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 7fea245f69a..7a9b9856271 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -213,16 +213,10 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle, // TODO(b/32495713): We aren't checking the called computations. break; case HloOpcode::kGetDimensionSize: { - int64 dimension_number = instr.dimensions(0); - const HloInstructionProto& operand = - *(LookUpInstructionByHandle(instr.operand_ids(0)).ValueOrDie()); - Shape operand_shape(operand.shape()); - if (operand_shape.is_dynamic_dimension(dimension_number)) { - *is_constant = false; - } + // DimensionSize is always considered constant in XLA -- If a dynamic + // dimension is presented, uint_max is returned. break; } - // Non functional ops. case HloOpcode::kRng: case HloOpcode::kAllReduce: @@ -268,8 +262,8 @@ Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num, for (int64 index : target_param_index) { param_shape_ptr = param_shape_ptr->mutable_tuple_shapes(index); } - // TODO(b/121223198): Set `is_dynamic` to the parameter shape when XLA - // backend can handle dynamic dimensions. + param_shape_ptr->set_dynamic_dimension(target_dim_num, + /*is_dynamic=*/true); *instr.mutable_shape() = param_shape.ToProto(); } } @@ -435,6 +429,7 @@ StatusOr XlaBuilder::InDimBroadcast( for (int64 dim : broadcast_dimensions) { instr.add_dimensions(dim); } + return AddInstruction(std::move(instr), HloOpcode::kBroadcast, {operand}); } @@ -468,11 +463,21 @@ StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, << operand_shape << "; output_shape: " << output_shape; } } + + Shape reshaped_shape = + ShapeUtil::MakeShape(operand_shape.element_type(), reshaped_dimensions); + + std::vector> unmodified_dims = + ShapeUtil::DimensionsUnmodifiedByReshape(operand_shape, reshaped_shape); + + for (auto& unmodified : unmodified_dims) { + if (operand_shape.is_dynamic_dimension(unmodified.first)) { + reshaped_shape.set_dynamic_dimension(unmodified.second, true); + } + } + // Eliminate the size one dimensions. - TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand, - Reshape(ShapeUtil::MakeShape(operand_shape.element_type(), - reshaped_dimensions), - operand)); + TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand, Reshape(reshaped_shape, operand)); // Broadcast 'reshape' up to the larger size. 
return InDimBroadcast(broadcast_shape, reshaped_operand, broadcast_dimensions); @@ -2428,7 +2433,7 @@ StatusOr XlaBuilder::IsConstant(const XlaOp& operand) const { } StatusOr XlaBuilder::BuildConstantSubGraph( - const XlaOp& root_op) { + XlaOp root_op, bool dynamic_dimension_is_minus_one) { TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op)); if (!is_constant) { auto op_status = LookUpInstruction(root_op); @@ -2483,9 +2488,12 @@ StatusOr XlaBuilder::BuildConstantSubGraph( TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto, LookUpInstructionByHandle(operand_handle)); - TF_RET_CHECK(!operand_proto->shape().is_dynamic_dimension(dimension)); - auto constant_dimension_size = - static_cast(operand_proto->shape().dimensions(dimension)); + int32 constant_dimension_size = -1; + if (!(operand_proto->shape().is_dynamic_dimension(dimension) && + dynamic_dimension_is_minus_one)) { + constant_dimension_size = + static_cast(operand_proto->shape().dimensions(dimension)); + } Literal literal = LiteralUtil::CreateR0(constant_dimension_size); diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 187cd261833..693ea3c493e 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -258,7 +258,8 @@ class XlaBuilder { // compile-time constant (see `IsConstant`), returns an error. // // This will copy the needed ops/computations to the subgraph. - StatusOr BuildConstantSubGraph(const XlaOp& root_op); + StatusOr BuildConstantSubGraph( + XlaOp root_op, bool dynamic_dimension_is_uint_max = false); // Returns the first error that was encountered while building the // computation. When an error is encountered, by default we return a vacuous diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 701729b94f3..32a34c801f0 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -917,10 +917,7 @@ TEST_F(XlaBuilderTest, DynamicSelectNotCompatible) { auto gte1 = GetTupleElement(p0, 1); // f32[4,5,<=6] Select(pred, gte0, gte1); Status status = BuildHloModule(&b).status(); - ASSERT_IS_NOT_OK(status); - EXPECT_THAT(status.error_message(), - ::testing::HasSubstr("Operands to select must be the same shape; " - "got f32[4,<=5,6] and f32[4,5,<=6]")); + ASSERT_IS_OK(status); } TEST_F(XlaBuilderTest, DynamicTranspose) { diff --git a/tensorflow/compiler/xla/cpu_function_runtime.h b/tensorflow/compiler/xla/cpu_function_runtime.h index 281ca5b2203..0c3355cbbfb 100644 --- a/tensorflow/compiler/xla/cpu_function_runtime.h +++ b/tensorflow/compiler/xla/cpu_function_runtime.h @@ -138,6 +138,17 @@ class BufferInfo { // Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. constexpr size_t kAlign = 64; +// When declaring variables that will be passed to an XLA instance as input via +// set_arg_data(), be it a regular input or a resource variable in the graph, +// the C++ variables must be aligned. +// +// Example usage: +// XLA_ALIGN std::array arg_x; +// XLA_ALIGN float arg_y; +// xla_instance.set_arg_data(0, arg_x.date()); +// xla_instance.set_arg_data(0, &arg_y); +#define XLA_ALIGN alignas(xla::cpu_function_runtime::kAlign) + // AlignedBufferBytes returns the sum of the size of each buffer in // `buffer_infos`, skipping constants, on-stack buffers and, if // allocate_entry_params is false, entry parameters. 
There are `n` entries in diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 13173e0dbc8..ec0059d37d9 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -436,10 +436,10 @@ static void AllocateFlags() { "behavior to help run tests on the host that run models in parallel " "across multiple devices."), tensorflow::Flag( - "xla_gpu_disable_ptxas_optimizations", + "xla_gpu_disable_gpuasm_optimizations", bool_setter_for( - &DebugOptions::set_xla_gpu_disable_ptxas_optimizations), - flag_values->xla_gpu_disable_ptxas_optimizations(), + &DebugOptions::set_xla_gpu_disable_gpuasm_optimizations), + flag_values->xla_gpu_disable_gpuasm_optimizations(), "In XLA:GPU run ptxas in -O0 (default is -O3)."), tensorflow::Flag( "xla_fuel", setter_for_xla_fuel, /*default_value_for_display=*/"", diff --git a/tensorflow/compiler/xla/layout.h b/tensorflow/compiler/xla/layout.h index 300abff395d..1234d01755b 100644 --- a/tensorflow/compiler/xla/layout.h +++ b/tensorflow/compiler/xla/layout.h @@ -214,6 +214,7 @@ class Layout { element_size_in_bits_ = value; return *this; } + static constexpr int64 kDefaultMemorySpace = 0; int64 memory_space() const { return memory_space_; } Layout& set_memory_space(int64 value) { memory_space_ = value; diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 03b47ba7089..c949cc6a5ba 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -944,6 +944,8 @@ absl::optional LiteralBase::GetAsComplex128( return {Get(multi_index)}; case C128: return {Get(multi_index)}; + case S8: + return {Get(multi_index)}; default: return absl::nullopt; } diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc index 1d9bd1f0695..eb8be012176 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -242,10 +242,35 @@ PyLocalClient::PyLocalClient( allocator_ = client_->backend().memory_allocator(); } + local_devices_.resize(device_states_.size()); for (const std::shared_ptr& device : devices_) { CHECK(id_to_device_.insert({device->id(), device}).second) << "Duplicate device id: " << device->id(); + + if (device->local_device_ordinal() != -1) { + int idx = device->local_device_ordinal(); + CHECK(local_devices_[idx] == nullptr) << idx; + CHECK_LT(idx, local_devices_.size()); + local_devices_[idx] = device; + } } + for (int idx = 0; idx < local_devices_.size(); ++idx) { + CHECK(local_devices_[idx] != nullptr) << idx; + } +} + +StatusOr PyLocalClient::SerializeExecutable( + const PyLocalExecutable& executable) const { + return Unimplemented("Cannot serialize executables on platform '%s'", + platform_name()); +} + +StatusOr> +PyLocalClient::DeserializeExecutable( + const std::string& serialized, + std::shared_ptr this_shared) const { + return Unimplemented("Cannot deserialize executables on platform '%s'", + platform_name()); } Status PyLocalClient::TransferToInfeed(const LiteralSlice& literal, diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/python/local_client.h index 37b3c56b7d2..1cc8175b402 100644 --- a/tensorflow/compiler/xla/python/local_client.h +++ b/tensorflow/compiler/xla/python/local_client.h @@ -38,6 +38,8 @@ limitations under the License. 
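A small self-contained sketch of the alignment contract behind the XLA_ALIGN macro added to cpu_function_runtime.h above; the local kAlign constant and macro redefinition are for illustration only, and the set_arg_data() calls from the comment are assumed to live on a generated XLA instance:

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

constexpr std::size_t kAlign = 64;  // mirrors xla::cpu_function_runtime::kAlign
#define XLA_ALIGN alignas(kAlign)

int main() {
  XLA_ALIGN std::array<float, 128> arg_x;
  XLA_ALIGN float arg_y;
  // Buffers later handed to set_arg_data(0, arg_x.data()) and
  // set_arg_data(1, &arg_y) must satisfy this 64-byte alignment.
  assert(reinterpret_cast<std::uintptr_t>(arg_x.data()) % kAlign == 0);
  assert(reinterpret_cast<std::uintptr_t>(&arg_y) % kAlign == 0);
  return 0;
}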
namespace xla { +class PyLocalExecutable; + class Device { public: explicit Device(int id, int local_device_ordinal, int host_id = 0) @@ -127,14 +129,17 @@ class PyLocalClient { int num_replicas) const; int device_count() const { return devices_.size(); } + int local_device_count() const { return local_devices_.size(); } const std::vector>& devices() { return devices_; } + const std::vector>& local_devices() { + return local_devices_; + } const std::map>& id_to_device() const { return id_to_device_; } int host_id() const { return host_id_; } const std::string& platform_name() const { return platform_name_; } - int local_device_count() const { return device_states_.size(); } DeviceState& device_state(int device_ordinal) const { return *device_states_.at(device_ordinal); } @@ -149,14 +154,29 @@ class PyLocalClient { return &h2d_transfer_pool_; } + // Returns a platform-specific serialization of `executable`. This is meant + // for transferring executables and not for storage, and the serialization is + // not guaranteed to be stable over time. + virtual StatusOr SerializeExecutable( + const PyLocalExecutable& executable) const; + + // Deserializes a serialized executable as produced by + // SerializeExecutable(). `serialized` must have been produced by client of + // the same platform. `this_shared` should point to this PyLocalClient. + virtual StatusOr> DeserializeExecutable( + const std::string& serialized, + std::shared_ptr this_shared) const; + protected: std::string platform_name_; LocalClient* client_; // Includes all devices, including non-local devices on multi-host platforms. std::vector> devices_; - // Maps Device::id() to the corresponding Device. + // Maps Device::id() to the corresponding Device. Includes all devices. std::map> id_to_device_; + // Local devices indexed by local device ordinal. + std::vector> local_devices_; int host_id_; // Device states local to this host. Indexed by local device ordinal. @@ -203,6 +223,7 @@ class PyLocalBuffer { const Shape& on_host_shape() const { return on_host_shape_; } int device_ordinal() const { return device_ordinal_; } const std::string& platform_name() const { return client_->platform_name(); } + std::shared_ptr client() const { return client_; } // Returns the buffer's value as a tuple DAG of Python arrays. 
If the value // has previously been prefetched to the host, then returns the prefetched @@ -299,6 +320,8 @@ class PyLocalExecutable { void Delete() { executable_ = nullptr; } + LocalExecutable* executable() const { return executable_.get(); } + private: StatusOr> ExecuteHelper( absl::Span argument_handles, int replica, diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 078fee8f652..08bfe78c47b 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -344,6 +344,7 @@ PYBIND11_MODULE(xla_extension, m) { .def("device_count", &PyLocalClient::device_count) .def("local_device_count", &PyLocalClient::local_device_count) .def("devices", &PyLocalClient::devices) + .def("local_devices", &PyLocalClient::local_devices) .def("host_id", &PyLocalClient::host_id) .def("TransferToInfeed", [](PyLocalClient* client, const LiteralSlice& literal, @@ -364,7 +365,15 @@ PYBIND11_MODULE(xla_extension, m) { literal_shared = std::make_shared(std::move(literal)); } return LiteralToPython(std::move(literal_shared)); - }); + }) + .def("SerializeExecutable", + [](PyLocalClient* client, + PyLocalExecutable* executable) -> StatusOr { + TF_ASSIGN_OR_RETURN(std::string serialized, + client->SerializeExecutable(*executable)); + return py::bytes(serialized); + }) + .def("DeserializeExecutable", &PyLocalClient::DeserializeExecutable); py::class_(m, "PyLocalBuffer") .def_static( @@ -417,7 +426,12 @@ PYBIND11_MODULE(xla_extension, m) { return LiteralToPython(std::move(literal)); }) .def("shape", &PyLocalBuffer::on_host_shape) - .def("device", &PyLocalBuffer::device_ordinal) + .def("device", + [](PyLocalBuffer* buffer) -> std::shared_ptr { + return buffer->client()->local_devices()[buffer->device_ordinal()]; + }) + // TODO(skye): get rid of `device_ordinal` once everything uses `device` + .def("device_ordinal", &PyLocalBuffer::device_ordinal) .def("platform", &PyLocalBuffer::platform_name) .def("is_deleted", [](const PyLocalBuffer& buffer) { diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 7abd2f7429d..4dcf3a26301 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -137,6 +137,12 @@ class LocalBackend(Backend): options, self.client, compile_options.device_assignment) + def serialize(self, executable): + return self.client.SerializeExecutable(executable) + + def deserialize(self, serialized_executable): + return self.client.DeserializeExecutable(serialized_executable, self.client) + xla_platform_names = { 'cpu': 'Host', diff --git a/tensorflow/compiler/xla/python/xrt.cc b/tensorflow/compiler/xla/python/xrt.cc index 147aafc356a..b2d7bbb829a 100644 --- a/tensorflow/compiler/xla/python/xrt.cc +++ b/tensorflow/compiler/xla/python/xrt.cc @@ -148,7 +148,10 @@ void AddXrtSubmodule(py::module* module) { }) .def("delete", &XrtBuffer::Delete) .def("destructure", &XrtBuffer::DestructureTuple) + // TODO(skyewm): remove after we update jax to call device_ordinal instead + // of device. 
.def("device", &XrtBuffer::xrt_device_ordinal) + .def("device_ordinal", &XrtBuffer::xrt_device_ordinal) .def("shape", &XrtBuffer::shape) .def("is_deleted", [](const XrtBuffer& buffer) { return !buffer.handle().valid(); }) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 25f88004f98..46d014f48d8 100755 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -945,9 +945,10 @@ cc_library( deps = [ ":service", "//tensorflow/compiler/xla/service/gpu:gpu_transfer_manager", - "//tensorflow/compiler/xla/service/mlir_gpu:mlir_compiler", "//tensorflow/core:stream_executor_no_cuda", - ], + ] + if_cuda_is_configured([ + "//tensorflow/compiler/xla/service/mlir_gpu:mlir_compiler", + ]), ) cc_library( @@ -2289,7 +2290,7 @@ cc_library( ], ) -tf_cc_test( +xla_test( name = "dynamic_padder_test", srcs = ["dynamic_padder_test.cc"], deps = [ @@ -2306,7 +2307,9 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:test", ], ) @@ -2589,9 +2592,13 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:macros", + "//tensorflow/core/platform:types", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:span", ], ) @@ -2873,6 +2880,7 @@ cc_library( ":call_graph", ":computation_layout", ":hlo", + ":hlo_alias_analysis", ":hlo_casting_utils", ":hlo_dce", ":hlo_graph_dumper", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 0296805a24b..1cfd1196508 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1878,14 +1878,17 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { Status AlgebraicSimplifierVisitor::HandleGather(HloInstruction* gather) { const Shape& operand_shape = gather->operand(0)->shape(); + if (ShapeUtil::IsZeroElementArray(operand_shape)) { + return ReplaceInstruction(gather, MakeScalarLike(gather, 0)); + } // If the operand of a gather is very small, it is easier to fuse a // sequence of selects. + const Shape& index_shape = gather->operand(1)->shape(); if (operand_shape.rank() == 1 && operand_shape.dimensions(0) <= options_.very_small_gather_size() && gather->gather_dimension_numbers().index_vector_dim() == - gather->operand(1)->shape().rank() && + index_shape.rank() && gather->gather_dimension_numbers().collapsed_slice_dims_size() == 1) { - const Shape& index_shape = gather->operand(1)->shape(); const int64 operand_elements = operand_shape.dimensions(0); auto get_value = [&](int64 i) { auto slice = computation_->AddInstruction(HloInstruction::CreateSlice( @@ -2165,13 +2168,34 @@ Status AlgebraicSimplifierVisitor::HandleLog(HloInstruction* log) { return Status::OK(); } - // ln(pow(A,B)) => B*ln(A) + // ln(pow(A,B)) => B*ln(abs(A)) + // or B*ln(A) if A is complex. if (Match(log, m::Log(m::Power(m::Op(&a), m::Op(&b))))) { + auto abs_a = ShapeUtil::ElementIsComplex(a->shape()) + ? 
a + : computation_->AddInstruction(HloInstruction::CreateUnary( + log->shape(), HloOpcode::kAbs, a)); + auto new_log = computation_->AddInstruction( + HloInstruction::CreateUnary(log->shape(), HloOpcode::kLog, abs_a)); + return ReplaceWithNewInstruction( + log, HloInstruction::CreateBinary(log->shape(), HloOpcode::kMultiply, + new_log, b)); + } + + if (Match(log, m::Log(m::Sqrt(m::Op(&a))))) { auto new_log = computation_->AddInstruction( HloInstruction::CreateUnary(log->shape(), HloOpcode::kLog, a)); return ReplaceWithNewInstruction( log, HloInstruction::CreateBinary(log->shape(), HloOpcode::kMultiply, - new_log, b)); + new_log, MakeScalarLike(log, 0.5))); + } + + if (Match(log, m::Log(m::Rsqrt(m::Op(&a))))) { + auto new_log = computation_->AddInstruction( + HloInstruction::CreateUnary(log->shape(), HloOpcode::kLog, a)); + return ReplaceWithNewInstruction( + log, HloInstruction::CreateBinary(log->shape(), HloOpcode::kMultiply, + new_log, MakeScalarLike(log, -0.5))); } return Status::OK(); @@ -2574,6 +2598,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) { power, HloInstruction::CreateUnary(power->shape(), HloOpcode::kExp, a_times_b)); } + VLOG(10) << "trying transform [pow(A, 2) => A*A]: " << power->ToString(); if (IsAll(rhs, 2)) { return ReplaceWithNewInstruction( @@ -3158,6 +3183,24 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { return Status::OK(); } + // Try to simplify concat -> slice to an operand of concat. + if (slice->operand(0)->opcode() == HloOpcode::kConcatenate && + IsUnstridedSlice(slice)) { + auto concat = slice->operand(0); + int64 concat_dim = concat->concatenate_dimension(); + int64 piece_start = 0; + for (auto piece : concat->operands()) { + if (!SameShape(piece, slice)) { + piece_start += piece->shape().dimensions(concat_dim); + continue; + } + if (slice->slice_starts(concat_dim) == piece_start) { + return ReplaceInstruction(slice, piece); + } + piece_start += piece->shape().dimensions(concat_dim); + } + } + // Do not try to reorder slices and reshapes after layout assignment as it may // be invalid. 
if (!options_.is_layout_sensitive()) { diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index f918634e075..3e4c906a4a5 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -1273,9 +1273,57 @@ TEST_F(AlgebraicSimplifierTest, LnPow) { AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Multiply(m::Log(m::Abs(m::Parameter(0))), + m::Parameter(1)))); +} + +TEST_F(AlgebraicSimplifierTest, LnSqrt) { + auto m = CreateNewVerifiedModule(); + Shape r0f32 = ShapeUtil::MakeShape(F32, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0f32, "param0")); + HloInstruction* sqrt = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kSqrt, param0)); + builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, sqrt)); + + auto computation = m->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Log(m::Sqrt(m::Parameter(0))))); + + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + EXPECT_THAT( computation->root_instruction(), - GmockMatch(m::Multiply(m::Log(m::Parameter(0)), m::Parameter(1)))); + GmockMatch(m::Multiply(m::Log(m::Parameter(0)), m::ConstantScalar(0.5)))); +} + +TEST_F(AlgebraicSimplifierTest, LnRsqrt) { + auto m = CreateNewVerifiedModule(); + Shape r0f32 = ShapeUtil::MakeShape(F32, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0f32, "param0")); + HloInstruction* rsqrt = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kRsqrt, param0)); + builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, rsqrt)); + + auto computation = m->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Log(m::Rsqrt(m::Parameter(0))))); + + AlgebraicSimplifier simplifier(default_options_); + ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + GmockMatch(m::Multiply(m::Log(m::Parameter(0)), + m::ConstantScalar(-0.5)))); } // Test that ln(exp(A)) is simplified to A @@ -5639,5 +5687,21 @@ TEST_F(AlgebraicSimplifierTest, MaxOfClamp) { GmockMatch(m::Clamp(m::Parameter(0), m::Parameter(1), m::Parameter(2)))); } +TEST_F(AlgebraicSimplifierTest, SliceOfConcat) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[100,50] parameter(0) + p1 = f32[50,50] parameter(1) + c0 = f32[150,50] concatenate(p0, p1), dimensions={0} + ROOT s0 = f32[50,50] slice(c0), slice={[100:150], [0:50]} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Parameter(1))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc index 131b50efc9c..de9c4f16efe 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander.cc +++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc @@ -105,8 +105,8 @@ 
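The concat -> slice folding above only fires when an unstrided slice covers exactly one concatenated operand, as exercised by the new SliceOfConcat test. A minimal standalone sketch of that bookkeeping (FindCoveredPiece is an illustrative helper, not an XLA function):

#include <cassert>
#include <cstdint>
#include <vector>

// Given operand sizes along the concatenation dimension and an unstrided
// slice [start, start + len), return the operand the slice covers exactly,
// or -1 if the slice straddles operands.
int64_t FindCoveredPiece(const std::vector<int64_t>& piece_sizes,
                         int64_t slice_start, int64_t slice_len) {
  int64_t piece_start = 0;
  for (int64_t i = 0; i < static_cast<int64_t>(piece_sizes.size()); ++i) {
    if (piece_sizes[i] == slice_len && slice_start == piece_start) return i;
    piece_start += piece_sizes[i];
  }
  return -1;
}

int main() {
  // Mirrors the test: concatenate f32[100,50] and f32[50,50] along dim 0,
  // then slice [100:150] -> the second operand (parameter 1).
  assert(FindCoveredPiece({100, 50}, /*slice_start=*/100, /*slice_len=*/50) == 1);
  return 0;
}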
class BatchNormExpanderVisitor : public DfsHloRewriteVisitor { HloInstruction* operand, int64 feature_index, const std::function)>& add_instruction) { - auto elements_per_feature_u32 = add_instruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); + auto elements_per_feature_s32 = add_instruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); for (int64 i = 0; i < operand->shape().rank(); ++i) { if (i == feature_index) { @@ -114,15 +114,15 @@ class BatchNormExpanderVisitor : public DfsHloRewriteVisitor { } auto dynamic_dimension_size = add_instruction(HloInstruction::CreateGetDimensionSize( - ShapeUtil::MakeShape(U32, {}), operand, i)); - elements_per_feature_u32 = add_instruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(U32, {}), HloOpcode::kMultiply, - dynamic_dimension_size, elements_per_feature_u32)); + ShapeUtil::MakeShape(S32, {}), operand, i)); + elements_per_feature_s32 = add_instruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(S32, {}), HloOpcode::kMultiply, + dynamic_dimension_size, elements_per_feature_s32)); } return HloInstruction::CreateConvert( ShapeUtil::MakeShape(operand->shape().element_type(), {}), - elements_per_feature_u32); + elements_per_feature_s32); } // Current HloComputation instance the BatchNormExpander is diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index d72a91f45df..6bf745df968 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -234,8 +234,9 @@ BufferAllocation::Slice BufferAllocation::GetSlice( void BufferAllocation::AddAssignment(const HloValue& buffer, int64 offset, int64 size) { - VLOG(4) << "Adding the following buffer to allocation #" << index() << " [" - << offset << ", " << size << "]: " << buffer; + VLOG(4) << "Adding the following buffer to allocation #" << index() + << absl::StrFormat(" (size=%d, offset=%d) %s", size, offset, + buffer.ToShortString()); CHECK(!assigned_buffers_.contains(&buffer)) << "LogicalBuffer " << buffer << " already assigned to allocation " << index_; @@ -291,6 +292,10 @@ BufferAllocationProto BufferAllocation::ToProto() const { return proto; } +static bool CompareHloValuesById(const HloValue* a, const HloValue* b) { + return a->id() < b->id(); +} + string BufferAllocation::ToString() const { string output; StrAppendFormat(&output, "allocation %d: %p, size %d", index_, this, size()); @@ -319,15 +324,14 @@ string BufferAllocation::ToString() const { for (const auto& buffer_offset_size : assigned_buffers_) { sorted_buffers.push_back(buffer_offset_size.first); } - absl::c_sort(sorted_buffers, [](const HloValue* a, const HloValue* b) { - return a->id() < b->id(); - }); + absl::c_sort(sorted_buffers, &CompareHloValuesById); for (const HloValue* buffer : sorted_buffers) { const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer); - StrAppend(&output, absl::StrFormat( - " %s [%d,%d]: %s\n", buffer->ToString(), - offset_size.offset, offset_size.size, - ShapeUtil::HumanStringWithLayout(buffer->shape()))); + StrAppend(&output, + absl::StrFormat( + " value: %s (size=%d,offset=%d): %s\n", + buffer->ToShortString(), offset_size.size, offset_size.offset, + ShapeUtil::HumanStringWithLayout(buffer->shape()))); } return output; } @@ -715,8 +719,17 @@ string BufferAssignment::Stats::ToString() const { string BufferAssignment::ToString() const { string output; absl::StrAppend(&output, "BufferAssignment:\n"); + std::vector 
used_values; for (auto& allocation : allocations_) { absl::StrAppend(&output, allocation.ToString()); + for (const auto& p : allocation.assigned_buffers()) { + used_values.push_back(p.first); + } + } + absl::StrAppend(&output, "\nUsed values:\n"); + absl::c_sort(used_values, &CompareHloValuesById); + for (const HloValue* value : used_values) { + absl::StrAppend(&output, value->ToString()); } return output; } @@ -808,12 +821,18 @@ bool BufferAssigner::LiveRangeInterferes(const HloValue* buffer1, auto operand_value = buffer1; auto user_value = buffer2; if (!can_share_as_operand(user_value, operand_value)) { + VLOG(4) << "End of live range of " << buffer1->ToShortString() + << " is equal to the start of live range of " + << buffer2->ToShortString() << ", buffer cannot be shared."; return true; } } else if (live_range_2.end == live_range_1.start) { auto operand_value = buffer2; auto user_value = buffer1; if (!can_share_as_operand(user_value, operand_value)) { + VLOG(4) << "End of live range of " << buffer2->ToShortString() + << " is equal to the start of live range of " + << buffer1->ToShortString() << ", buffer cannot be shared."; return true; } } else { @@ -898,6 +917,9 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, for (const HloValue* new_value : hlo_buffer.values()) { if (assignment->hlo_live_range().total_order_scheduled()) { if (LiveRangeInterferes(new_value, &assigned_buffer, assignment)) { + VLOG(4) << "Can't assign: assignee " << assigned_buffer + << " live range interferes with " + << new_value->ToShortString(); return false; } } else if (assignment->hlo_ordering().MayInterfere( @@ -1235,20 +1257,24 @@ Status BufferAssigner::AssignBuffersForComputations( return a_size > b_size; // use ">" for decreasing size. } + // Values which live out the computation lifetime will be assigned + // first, as they can not be given to the heap simulator. const bool a_live_out = alias_analysis.BufferLivesOut(*a); const bool b_live_out = alias_analysis.BufferLivesOut(*b); if (a_live_out != b_live_out) { return a_live_out; } + + // Process values in the reverse postorder, since we have to start + // with the last value. 
auto compare = [&post_order_position](const HloValue* value1, const HloValue* value2) { - return post_order_position.at(value1->instruction()) < + return post_order_position.at(value1->instruction()) > post_order_position.at(value2->instruction()); }; const HloValue* a_min = *absl::c_min_element(a->values(), compare); const HloValue* b_min = *absl::c_min_element(b->values(), compare); - return post_order_position.at(a_min->instruction()) < - post_order_position.at(b_min->instruction()); + return compare(a_min, b_min); }); std::vector allocation_indices; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index 1c985485d43..3ec5c1e3d49 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -2432,6 +2432,50 @@ ENTRY Main { GetAllocation(*buffers, param0, {1, 1})); } +TEST_F(BufferAssignmentTest, ProcessingOrderTest) { + const char* hlo_text = R"( +HloModule nested_convolution + +ENTRY %nested_convolution (param: f32[200,32,32,1]) -> f32[200,32,32,1] { + %param = f32[200,32,32,1]{3,2,1,0} parameter(0) + %bitcast = f32[200,32,32,1]{2,1,3,0} bitcast(f32[200,32,32,1]{3,2,1,0} %param) + %one = f32[] constant(1) + %conv_window = f32[3,3,1,1]{1,0,2,3} broadcast(f32[] %one), dimensions={} + %conv0 = (f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) custom-call(f32[200,32,32,1]{2,1,3,0} %bitcast, f32[3,3,1,1]{1,0,2,3} %conv_window), + window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{algorithm:1,tensor_ops_enabled:true,conv_result_scale:1}" + %get-tuple-element.6 = f32[200,32,32,1]{2,1,3,0} get-tuple-element((f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) %conv0), index=0 + %conv1 = (f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) custom-call(f32[200,32,32,1]{2,1,3,0} %get-tuple-element.6, f32[3,3,1,1]{1,0,2,3} %conv_window), + window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{algorithm:1,tensor_ops_enabled:true,conv_result_scale:1}" + %get-tuple-element.7 = f32[200,32,32,1]{2,1,3,0} get-tuple-element((f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) %conv1), index=0 + %conv2 = (f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) custom-call(f32[200,32,32,1]{2,1,3,0} %get-tuple-element.7, f32[3,3,1,1]{1,0,2,3} %conv_window), + window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{algorithm:1,tensor_ops_enabled:true,conv_result_scale:1}" + %get-tuple-element.8 = f32[200,32,32,1]{2,1,3,0} get-tuple-element((f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) %conv2), index=0 + %conv3 = (f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) custom-call(f32[200,32,32,1]{2,1,3,0} %get-tuple-element.8, f32[3,3,1,1]{1,0,2,3} %conv_window), + window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{algorithm:1,tensor_ops_enabled:true,conv_result_scale:1}" + %get-tuple-element.9 = f32[200,32,32,1]{2,1,3,0} get-tuple-element((f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) %conv3), index=0 + %conv4 = (f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) custom-call(f32[200,32,32,1]{2,1,3,0} %get-tuple-element.9, f32[3,3,1,1]{1,0,2,3} %conv_window), + window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{algorithm:1,tensor_ops_enabled:true,conv_result_scale:1}" + %get-tuple-element.10 = f32[200,32,32,1]{2,1,3,0} 
get-tuple-element((f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) %conv4), index=0 + %conv5 = (f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) custom-call(f32[200,32,32,1]{2,1,3,0} %get-tuple-element.10, f32[3,3,1,1]{1,0,2,3} %conv_window), + window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convForward", backend_config="{algorithm:1,tensor_ops_enabled:true,conv_result_scale:1}" + %get-tuple-element.11 = f32[200,32,32,1]{2,1,3,0} get-tuple-element((f32[200,32,32,1]{2,1,3,0}, u8[6152]{0}) %conv5), index=0 + ROOT %bitcast.1 = f32[200,32,32,1]{3,2,1,0} bitcast(f32[200,32,32,1]{2,1,3,0} %get-tuple-element.11) +} +)"; + + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsFromFlags()); + TF_ASSERT_OK_AND_ASSIGN(auto m, + ParseAndReturnVerifiedModule(hlo_text, config)); + + std::unique_ptr buffers = RunBufferAssignment(m.get()); + + // We should occupy strictly less size than 4 * size of the buffer required + // for convolution. + int64 conv_size_bytes = 200 * 32 * 32 * 4; + EXPECT_LT(buffers->GetStats().total_allocation_bytes, conv_size_bytes * 4); +} + TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { auto module = CreateNewVerifiedModule(); auto builder = HloComputation::Builder(TestName()); diff --git a/tensorflow/compiler/xla/service/convolution_group_converter.cc b/tensorflow/compiler/xla/service/convolution_group_converter.cc index 20ebafcf780..cfcf059ba5f 100644 --- a/tensorflow/compiler/xla/service/convolution_group_converter.cc +++ b/tensorflow/compiler/xla/service/convolution_group_converter.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/convolution_group_converter.h" +#include #include #include @@ -474,8 +475,6 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { new_convolution))); } } else { - int64 activation_input_feature_dim = dim_numbers.input_feature_dimension(); - int64 output_feature = filter->shape().dimensions(kernel_output_feature_dim); @@ -487,11 +486,62 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { // [3, 2, 4]{S, B, IF} depth conv [3, 1, 4]{S, IF, OF}, where S is the // additional spatial dimension. The generated convolution output will be // [1, 2, 4]{S, B, OF} and then reshape the output back to [2, 4] {B, OF}. - - if (group_count == output_feature && !filter_expansion_) { + // We only do this for b0..0f or f0..0b dimension labels on activations. + const int64 input_feature_dim = dim_numbers.input_feature_dimension(); + const int64 input_batch_dim = dim_numbers.input_batch_dimension(); + const int64 activations_dimension_count = + convolution->operand(0)->shape().dimensions().size(); + if (group_count == output_feature && !filter_expansion_ && + ((input_feature_dim == 0 && + input_batch_dim == activations_dimension_count - 1) || + (input_batch_dim == 0 && + input_feature_dim == activations_dimension_count - 1))) { auto filter = convolution->mutable_operand(1); auto activation = convolution->mutable_operand(0); + // We want b0..0f logical dimensions on activations. If they are f0..0b + // instead, we transpose the activations to have the right dimension + // ordering. + if (input_feature_dim < input_batch_dim) { + // Generate the required shape for activations by swapping batch and + // feature dimension sizes. 
+ Shape new_act_shape = activation->shape(); + new_act_shape.set_dimensions(dim_numbers.input_feature_dimension(), + activation->shape().dimensions( + dim_numbers.input_batch_dimension())); + new_act_shape.set_dimensions( + dim_numbers.input_batch_dimension(), + activation->shape().dimensions( + dim_numbers.input_feature_dimension())); + + // Generate dimension mapping. + std::vector transpose_dims(new_act_shape.dimensions_size()); + std::iota(transpose_dims.begin(), transpose_dims.end(), 0); + std::iter_swap(transpose_dims.begin(), transpose_dims.end() - 1); + + // Transpose the activations. Change the convolution input. + auto transposed_activations = + computation_->AddInstruction(HloInstruction::CreateTranspose( + new_act_shape, activation, transpose_dims)); + TF_CHECK_OK(convolution->ReplaceOperandWithDifferentShape( + 0, transposed_activations)); + + const int64 old_feature_dim = dim_numbers.input_feature_dimension(); + const int64 old_batch_dim = dim_numbers.input_batch_dimension(); + + // Rectify the convolution dimension numbers. + dim_numbers.set_input_feature_dimension(old_batch_dim); + dim_numbers.set_input_batch_dimension(old_feature_dim); + convolution->set_convolution_dimension_numbers(dim_numbers); + + // Update the data structures we'd use. + dim_numbers = convolution->convolution_dimension_numbers(); + activation = convolution->mutable_operand(0); + } + + const int64 activation_input_feature_dim = + dim_numbers.input_feature_dimension(); + // Add spatial dimension to the activation, and reshape. Shape reshaped_activation_shape = activation->shape(); ShapeUtil::AppendMajorDimension(group_size, &reshaped_activation_shape); @@ -534,12 +584,16 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { /*batch_group_count=*/1, new_window, dim_numbers, convolution->precision_config())); + VLOG(2) << "New convolution " << new_convolution->ToString(); + // Delete the extra spatial dimension, and reshape. 
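The activation transpose above builds its permutation with std::iota followed by std::iter_swap on the first and last entries; a tiny standalone check of what that yields for rank-4 f0..0b activations:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  std::vector<int64_t> transpose_dims(4);
  std::iota(transpose_dims.begin(), transpose_dims.end(), 0);
  std::iter_swap(transpose_dims.begin(), transpose_dims.end() - 1);
  // Only the outermost and innermost logical dimensions trade places,
  // which swaps the batch and feature dimensions for f0..0b layouts.
  assert((transpose_dims == std::vector<int64_t>{3, 1, 2, 0}));
  return 0;
}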
Shape reshaped_convolution_shape = ShapeUtil::DeleteDimension(new_spatial_dim, new_convolution->shape()); auto reshaped_convolution = HloInstruction::CreateReshape( reshaped_convolution_shape, new_convolution); + VLOG(2) << "Reshaped convolution " << reshaped_convolution->ToString(); + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( convolution, std::move(reshaped_convolution))); diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 7606e31b24d..85cf5e70f55 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -637,19 +637,6 @@ class CopyRemover { DCHECK(src != nullptr); DCHECK(dest != nullptr); - auto is_live_range_before = [this](const ValueNode& a, const ValueNode& b) { - VLOG(3) << "Checking live range of " << *a.value << " WRT " << *b.value; - if (LiveRangeBefore(a, b)) { - VLOG(2) << " Live range of " << a.value->ToShortString() - << " is before " << b.value->ToShortString(); - return true; - } else { - VLOG(2) << " Live range of " << a.value->ToShortString() - << " is not before " << b.value->ToShortString(); - return false; - } - }; - VLOG(3) << copy->name() << " copies value " << src->value->ToShortString(); VLOG(3) << "Source buffer values: " << ValueListToString(src); VLOG(3) << "Dest buffer values: " << ValueListToString(dest); @@ -715,7 +702,7 @@ class CopyRemover { ValueNode* next_dest = Next(*dest); if (next_dest != nullptr) { // Live range of 'from' value (s_x) must be before 'next_dest' (d_1); - if (!is_live_range_before(*src, *next_dest)) { + if (!LiveRangeBefore(*src, *next_dest)) { return false; } } @@ -725,7 +712,7 @@ class CopyRemover { // Live range of 'last_dest' (d_m) must be before 'next_src' s_{x+1}. ValueNode* last_dest = dest->prev; DCHECK(IsTail(*last_dest)); - if (!is_live_range_before(*last_dest, *next_src)) { + if (!LiveRangeBefore(*last_dest, *next_src)) { return false; } } @@ -754,13 +741,13 @@ class CopyRemover { DCHECK(prev_dest != nullptr); ValueNode* first_src = src->next; DCHECK(IsHead(*first_src)); - if (!is_live_range_before(*prev_dest, *first_src)) { + if (!LiveRangeBefore(*prev_dest, *first_src)) { // Live range of value d_{y-1} is not before s_0. return false; } ValueNode* next_dest = Next(*dest); if (next_dest != nullptr) { - if (!is_live_range_before(*src, *next_dest)) { + if (!LiveRangeBefore(*src, *next_dest)) { // Live range of value s_n is not before d_{y+1}. return false; } @@ -829,19 +816,30 @@ class CopyRemover { // We cannot use LiveRangeStrictlyBefore because HloValue::uses() is not // updated as copies are removed. 
bool LiveRangeBefore(const ValueNode& a, const ValueNode& b) { - if (a.uses.empty()) { - VLOG(2) << "Empty uses for " << *a.value; - return ordering_.IsDefinedBefore(*a.value, *b.value); - } - for (const HloUse* use : a.uses) { - VLOG(2) << "Checking use " << *use << " against " << *b.value; - if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) { - VLOG(2) << "Use " << *use << " is NOT before " << *b.value; - return false; + VLOG(3) << "Checking live range of " << *a.value << " WRT " << *b.value; + bool is_live_range_before = [&] { + if (a.uses.empty()) { + VLOG(2) << "Empty uses for " << *a.value; + return ordering_.IsDefinedBefore(*a.value, *b.value); } - VLOG(2) << "Use " << *use << " is before " << *b.value; + for (const HloUse* use : a.uses) { + VLOG(3) << "Checking use " << *use << " against " << *b.value; + if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) { + VLOG(2) << "Use " << *use << " is NOT before " << *b.value; + return false; + } + VLOG(3) << "Use " << *use << " is before " << *b.value; + } + return true; + }(); + if (is_live_range_before) { + VLOG(2) << " Live range of " << a.value->ToShortString() << " is before " + << b.value->ToShortString(); + } else { + VLOG(2) << " Live range of " << a.value->ToShortString() + << " is not before " << b.value->ToShortString(); } - return true; + return is_live_range_before; } // Returns whether 'node' is the last node in its list. diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index e7371c79b39..7b15d49cc47 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -344,6 +344,9 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( TransposeFolding::NeverFoldTranspose); pipeline.AddPass(/*is_layout_sensitive=*/false); + // Layout assignment uses alias analysis, which requires the call graph to be + // flattened. + pipeline.AddPass(); pipeline.AddPass( module->mutable_entry_computation_layout(), LayoutAssignment::InstructionCanChangeLayout, target_machine_features); @@ -407,7 +410,6 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn( // before (and sometime after) copy insertion, to avoid dead code from // interfering with the rewrites. pipeline.AddPass(); - pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); return pipeline.Run(module).status(); diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc index 1f7d41c7b94..e02a58210c2 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc @@ -449,10 +449,15 @@ Status DynamicDimensionInferenceVisitor::HandleElementwiseBinary( Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { return ForEachOperandDynamicDimension( - hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size, - DimensionConstraint constraint) { + hlo, + [&](HloInstruction* operand, ShapeIndex index, int64 dimension, + int64 operand_index, HloInstruction* dynamic_size, + DimensionConstraint constraint) -> Status { HloInstruction* reshape = hlo; + TF_RET_CHECK(reshape->shape().rank() > 0) + << "Reshaping a dynamic dimension into a scalar, which has " + "undefined behavior. 
The offending instruction is: " + << reshape->ToString(); // Reshape is supported as long as it is the most // major one and it is combining with other non-dynamic dimensions. const int64 output_most_major = reshape->shape().dimensions(0); @@ -463,7 +468,7 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { reshape->shape().dimensions(0) / operand->shape().dimensions(0); HloInstruction* multiplier_hlo = hlo->parent()->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(multiplier))); + LiteralUtil::CreateR0(multiplier))); HloInstruction* new_dynamic_size = hlo->parent()->AddInstruction(HloInstruction::CreateBinary( @@ -638,7 +643,7 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { reshape->shape().dimensions(dynamic_dimension); HloInstruction* divisor_hlo = hlo->parent()->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(divisor))); + LiteralUtil::CreateR0(divisor))); HloInstruction* new_dynamic_size = hlo->parent()->AddInstruction(HloInstruction::CreateBinary( @@ -828,20 +833,13 @@ Status DynamicDimensionInferenceVisitor::HandleScatter(HloInstruction* hlo) { int64 operand_index, HloInstruction* operand_dynamic_size, DimensionConstraint constraint) { if (operand_index == 0) { - return Unimplemented( - "Detects a dynamic dimension on the data input of scatter, which " - "is not supported: %s", - hlo->ToString()); - } - - const ScatterDimensionNumbers& scatter_dims = - hlo->scatter_dimension_numbers(); - if (operand_index == 1) { parent_->SetDynamicSize(hlo, {}, dimension, operand_dynamic_size, constraint); return Status::OK(); } + const ScatterDimensionNumbers& scatter_dims = + hlo->scatter_dimension_numbers(); if (operand_index == 2 && absl::c_linear_search(scatter_dims.update_window_dims(), dimension)) { diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h index 12af09fee4a..e8e89c8357b 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h @@ -164,6 +164,8 @@ class DynamicDimensionInference { // by a scalar instruction `size`. 
void SetDynamicSize(HloInstruction* inst, const ShapeIndex& index, int64 dim, HloInstruction* size, DimensionConstraint constraint) { + VLOG(1) << "Set dimension inst " << inst->name() << " index " + << index.ToString() << "@" << dim << " to " << size->ToString(); Shape subshape = ShapeUtil::GetSubshape(inst->shape(), index); CHECK(!subshape.IsTuple()) << "Can't set a tuple shape to dynamic dimension"; diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc index 7a13307ffbf..264263570cb 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc @@ -94,7 +94,7 @@ class DynamicDimensionInferenceTest : public HloTestBase { std::unique_ptr module_; std::unique_ptr inference_; - const Shape scalar_shape_ = ShapeUtil::MakeShape(U32, {}); + const Shape scalar_shape_ = ShapeUtil::MakeShape(S32, {}); }; TEST_F(DynamicDimensionInferenceTest, ParamTest) { @@ -557,7 +557,7 @@ TEST_F(DynamicDimensionInferenceTest, ReshapeTestMajorDimension) { EXPECT_NE(inference_->GetDynamicSize(reshape, {}, 0), nullptr); const Literal& multiplier = inference_->GetDynamicSize(reshape, {}, 0)->operand(1)->literal(); - LiteralTestUtil::ExpectR0Equal(10, multiplier); + LiteralTestUtil::ExpectR0Equal(10, multiplier); } TEST_F(DynamicDimensionInferenceTest, GatherTest) { @@ -895,7 +895,7 @@ TEST_F(DynamicDimensionInferenceTest, DynamicSliceTest) { std::vector params; for (int i = 0; i < 2; ++i) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( - i + 2, ShapeUtil::MakeShape(U32, {}), "slice_indices"))); + i + 2, ShapeUtil::MakeShape(S32, {}), "slice_indices"))); } auto* slice = builder.AddInstruction(HloInstruction::CreateDynamicSlice( @@ -997,7 +997,7 @@ TEST_F(DynamicDimensionInferenceTest, DynamicSliceSingleElementTest) { std::vector params; for (int i = 0; i < 2; ++i) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( - i + 2, ShapeUtil::MakeShape(U32, {}), "slice_indices"))); + i + 2, ShapeUtil::MakeShape(S32, {}), "slice_indices"))); } auto* slice = builder.AddInstruction(HloInstruction::CreateDynamicSlice( diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc index 5fea5d823de..dc16ef4d9f9 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder.cc @@ -78,9 +78,18 @@ StatusOr ChooseIdentityValue(HloInstruction* inst, case HloOpcode::kSelectAndScatter: { return inst->mutable_operand(2); } + case HloOpcode::kScatter: { + if (operand_number != 1) { + return nullptr; + } + PrimitiveType indices_ptype = + inst->operand(operand_number)->shape().element_type(); + + return comp->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::MaxValue(indices_ptype))); + } case HloOpcode::kParameter: case HloOpcode::kGather: - case HloOpcode::kScatter: case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: case HloOpcode::kGetDimensionSize: @@ -128,17 +137,19 @@ StatusOr DynamicPadder::Run(HloModule* module) { for (HloInstruction* inst : computation->instructions()) { for (int64 operand_num = 0; operand_num < inst->operand_count(); ++operand_num) { - HloInstruction* operand = inst->mutable_operand(operand_num); + HloInstruction* original_operand = inst->mutable_operand(operand_num); + HloInstruction* operand = original_operand; if (!operand->shape().IsArray()) { 
continue; } for (int64 dim = 0; dim < operand->shape().rank(); ++dim) { HloInstruction* dynamic_size = - dynamic_dimension_inference.GetDynamicSize(operand, {}, dim); + dynamic_dimension_inference.GetDynamicSize(original_operand, {}, + dim); if (dynamic_size == nullptr) { continue; } - VLOG(1) << "Has dynamic dimension of operand" << operand_num << " @" + VLOG(2) << "Has dynamic dimension of operand" << operand_num << " @" << dim; if (ShouldSkipPadOnOperand(inst, operand_num, dim)) { @@ -164,7 +175,7 @@ StatusOr DynamicPadder::Run(HloModule* module) { // mask and pad value. // const Shape mask_shape = - ShapeUtil::ChangeElementType(operand->shape(), xla::U32); + ShapeUtil::ChangeElementType(operand->shape(), xla::S32); const Shape pred_shape = ShapeUtil::ChangeElementType(operand->shape(), xla::PRED); HloInstruction* iota = computation->AddInstruction( diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc index 2963deaa317..4dfb93ee7d8 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc @@ -28,7 +28,10 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -65,7 +68,7 @@ class DynamicPadderTest : public HloTestBase { } std::unique_ptr module_; - const Shape scalar_shape_ = ShapeUtil::MakeShape(U32, {}); + const Shape scalar_shape_ = ShapeUtil::MakeShape(S32, {}); }; TEST_F(DynamicPadderTest, ReduceTest) { @@ -212,5 +215,189 @@ TEST_F(DynamicPadderTest, ReduceWindowNoPadForTrivialWindow) { EXPECT_THAT(output->operand(0), op::Parameter()); } +// Test that dynamic padder has the same result as if not padded. 
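The padded-scatter tests below rely on the new kScatter identity value above: entries of the indices operand beyond the dynamic size are rewritten to MaxValue of the index type, and scatter drops updates whose indices fall out of bounds, so the padding does not change the result. A standalone sketch of that effect (ScatterAdd1D is an illustrative stand-in, not the XLA op):

#include <cassert>
#include <cstdint>
#include <limits>
#include <vector>

// Out-of-bounds updates are skipped, mirroring the scatter semantics the
// padder relies on.
void ScatterAdd1D(std::vector<int32_t>& operand,
                  const std::vector<int64_t>& indices,
                  const std::vector<int32_t>& updates) {
  for (size_t i = 0; i < indices.size(); ++i) {
    if (indices[i] >= 0 &&
        indices[i] < static_cast<int64_t>(operand.size())) {
      operand[indices[i]] += updates[i];
    }
  }
}

int main() {
  std::vector<int32_t> unpadded = {1, 2, 3};
  std::vector<int32_t> padded = unpadded;
  ScatterAdd1D(unpadded, {0, 2}, {10, 20});
  // Two extra padded updates whose indices were rewritten to INT32_MAX.
  const int64_t kPad = std::numeric_limits<int32_t>::max();
  ScatterAdd1D(padded, {0, 2, kPad, kPad}, {10, 20, 99, 99});
  assert(unpadded == padded);
  return 0;
}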
+class ExecutionTest : public HloTestBase { + protected: + std::unique_ptr GetHloModule(const string& hlo_text) { + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsForTest()); + std::unique_ptr module = + ParseAndReturnUnverifiedModule(hlo_text, config).ValueOrDie(); + return module; + } +}; + +XLA_TEST_F(ExecutionTest, ScatterUpdate) { + // Test that scattering on indices=[2] is same as scattering on indices=[4] + // and dynamic dimension = 2 + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[INDICES_BOUND] parameter(1) + updates = s32[INDICES_BOUND,3] parameter(2) + dynamic_size = s32[] parameter(3) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + +} +)"; + const string hlo_text_not_padded = + absl::StrReplaceAll(hlo_text, {{"INDICES_BOUND", "2"}}); + auto module_not_padded = GetHloModule(hlo_text_not_padded); + + Literal operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); + Literal updates = LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + Literal dynamic_size = LiteralUtil::CreateR0(2); + + Literal not_padded = + ExecuteAndTransfer(std::move(module_not_padded), + {&operand, &scatter_indices, &updates, &dynamic_size}); + + // Pad input to 4. + const string hlo_text_padded = + absl::StrReplaceAll(hlo_text, {{"INDICES_BOUND", "4"}}); + auto module_padded = GetHloModule(hlo_text_padded); + // Set up dynamic parameter binding. + TF_CHECK_OK(module_padded->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{3, {}}, + DynamicParameterBinding::DynamicDimension{1, {}, 0})); + TF_CHECK_OK(module_padded->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{3, {}}, + DynamicParameterBinding::DynamicDimension{2, {}, 0})); + // Pad the rest of input with garbage data. 
+ Literal scatter_indices_padded = LiteralUtil::CreateR1({0, 2, 0, 4}); + Literal updates_padded = LiteralUtil::CreateR2( + {{10, 20, 30}, {70, 80, 90}, {30, 22, 11}, {-1, 20, -1}}); + DynamicPadder padder; + TF_CHECK_OK(padder.Run(module_padded.get()).status()); + Literal padded = ExecuteAndTransfer( + std::move(module_padded), + {&operand, &scatter_indices_padded, &updates_padded, &dynamic_size}); + + EXPECT_EQ(padded, not_padded); +} + +XLA_TEST_F(ExecutionTest, ScatterUpdateF32) { + // Test that scattering on indices=[2] is same as scattering on indices=[4] + // and dynamic dimension = 2 + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +update_f32 (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + ROOT rhs = f32[] parameter(1) +} + +ENTRY main { + operand = f32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = f32[2,3] parameter(2) + dynamic_size = s32[] parameter(3) + ROOT scatter = f32[3,3] scatter(operand, indices, updates), + to_apply=update_f32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + +} +)"; + + auto module_not_padded = GetHloModule(hlo_text); + + Literal operand = LiteralUtil::CreateR2( + {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}}); + Literal scatter_indices = LiteralUtil::CreateR1({0, 2}); + Literal updates = + LiteralUtil::CreateR2({{10.0, 20.0, 30.0}, {70.0, 80.0, 90.0}}); + // Dynamic Size is 1, pad to 2 + Literal dynamic_size = LiteralUtil::CreateR0(1); + + auto module_padded = GetHloModule(hlo_text); + // Set up dynamic parameter binding. + TF_CHECK_OK(module_padded->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{3, {}}, + DynamicParameterBinding::DynamicDimension{1, {}, 0})); + TF_CHECK_OK(module_padded->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{3, {}}, + DynamicParameterBinding::DynamicDimension{2, {}, 0})); + DynamicPadder padder; + TF_CHECK_OK(padder.Run(module_padded.get()).status()); + Literal not_padded = + ExecuteAndTransfer(std::move(module_padded), + {&operand, &scatter_indices, &updates, &dynamic_size}); + // Although we have two indices, only the first element is updated because of + // padding. + EXPECT_EQ(LiteralUtil::CreateR2( + {{10.0, 20.0, 30.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}}), + not_padded); +} + +XLA_TEST_F(ExecutionTest, TwoDimensionReduce) { + // Test that reducing on operand=[2,2] is same as reducing on operand=[4,4] + // and dynamic dimension = 2 + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(lhs, rhs) +} + +ENTRY main { + param = s32[INDICES_BOUND, INDICES_BOUND] parameter(0) + dynamic_size = s32[] parameter(1) + const = s32[] constant(0) + ROOT reduce = s32[] reduce(param, const), + dimensions={0, 1}, + to_apply=update_s32 +} +)"; + const string hlo_text_not_padded = + absl::StrReplaceAll(hlo_text, {{"INDICES_BOUND", "2"}}); + auto module_not_padded = GetHloModule(hlo_text_not_padded); + + Literal operand = LiteralUtil::CreateR2({{1, 2}, {4, 5}}); + Literal dynamic_size = LiteralUtil::CreateR0(2); + + Literal not_padded = ExecuteAndTransfer(std::move(module_not_padded), + {&operand, &dynamic_size}); + + // Pad input to 4. + const string hlo_text_padded = + absl::StrReplaceAll(hlo_text, {{"INDICES_BOUND", "4"}}); + auto module_padded = GetHloModule(hlo_text_padded); + // Set up dynamic parameter binding. 
+ TF_CHECK_OK(module_padded->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{1, {}}, + DynamicParameterBinding::DynamicDimension{0, {}, 0})); + TF_CHECK_OK(module_padded->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{1, {}}, + DynamicParameterBinding::DynamicDimension{0, {}, 1})); + // Pad the rest of input with garbage data. + Literal operand_padded = LiteralUtil::CreateR2( + {{1, 2, 3, 4}, {4, 5, 6, 7}, {1, 2, 3, 4}, {4, 5, 6, 7}}); + DynamicPadder padder; + TF_CHECK_OK(padder.Run(module_padded.get()).status()); + Literal padded = ExecuteAndTransfer(std::move(module_padded), + {&operand_padded, &dynamic_size}); + + EXPECT_EQ(padded, not_padded); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 21476c7e921..7b871951ed0 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -456,6 +456,7 @@ tf_cc_test( cc_library( name = "gpu_executable", srcs = [ + "cholesky_thunk.cc", "collective_permute_thunk.cc", "conditional_thunk.cc", "convolution_thunk.cc", @@ -476,10 +477,9 @@ cc_library( "triangular_solve_thunk.cc", "tuple_thunk.cc", "while_thunk.cc", - ] + if_cuda_is_configured([ - "cholesky_thunk.cc", - ]), + ], hdrs = [ + "cholesky_thunk.h", "collective_permute_thunk.h", "conditional_thunk.h", "convolution_thunk.h", @@ -500,12 +500,11 @@ cc_library( "triangular_solve_thunk.h", "tuple_thunk.h", "while_thunk.h", - ] + if_cuda_is_configured([ - "cholesky_thunk.h", - ]), + ], deps = [ ":backend_configs", ":buffer_allocations", + ":cusolver_context", ":cudnn_conv_runner", ":gpu_debug_info_manager", ":gpu_types", @@ -559,7 +558,6 @@ cc_library( "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ] + if_cuda_is_configured([ - ":cusolver_context", "//tensorflow/stream_executor/cuda:cuda_stream", "//tensorflow/core/platform/default/build_config:cublas_plugin", "//tensorflow/core/platform/default/build_config:cudnn_plugin", @@ -633,7 +631,7 @@ cc_library( "//tensorflow/stream_executor:blas", "//tensorflow/stream_executor:device_memory", "//tensorflow/stream_executor:device_memory_allocator", - "//tensorflow/stream_executor/cuda:redzone_allocator", + "//tensorflow/stream_executor/gpu:redzone_allocator", "@com_google_absl//absl/types:optional", ], ) @@ -664,7 +662,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/util/proto:proto_utils", "//tensorflow/stream_executor:device_memory_allocator", - "//tensorflow/stream_executor/cuda:redzone_allocator", + "//tensorflow/stream_executor/gpu:redzone_allocator", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -718,6 +716,7 @@ tf_cc_test( deps = [ ":cudnn_conv_rewriter", ":ir_emission_utils", + "//tensorflow/compiler/jit:xla_gpu_jit", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:hlo", @@ -731,21 +730,22 @@ tf_cc_test( cc_library( name = "cusolver_context", - srcs = ["cusolver_context.cc"], + srcs = if_cuda_is_configured(["cusolver_context.cc"]), hdrs = ["cusolver_context.h"], deps = [ # LINT.IfChange "@local_config_cuda//cuda:cublas_headers", # LINT.ThenChange(//tensorflow/copy.bara.sky:cublas_headers) - "@local_config_cuda//cuda:cuda_headers", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", 
"//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:blas", + ] + if_cuda_is_configured([ + "@local_config_cuda//cuda:cuda_headers", "//tensorflow/stream_executor/cuda:cusolver_lib", - ], + ]), ) cc_library( @@ -950,11 +950,12 @@ cc_library( ) cc_library( - name = "cudnn_conv_pad_for_tensor_cores", - srcs = ["cudnn_conv_pad_for_tensor_cores.cc"], - hdrs = ["cudnn_conv_pad_for_tensor_cores.h"], + name = "cudnn_pad_for_convolutions", + srcs = ["cudnn_pad_for_convolutions.cc"], + hdrs = ["cudnn_pad_for_convolutions.h"], deps = [ ":ir_emission_utils", + ":stream_executor_util", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", @@ -964,10 +965,10 @@ cc_library( ) tf_cc_test( - name = "cudnn_conv_pad_for_tensor_cores_test", - srcs = ["cudnn_conv_pad_for_tensor_cores_test.cc"], + name = "cudnn_pad_for_convolutions_test", + srcs = ["cudnn_pad_for_convolutions_test.cc"], deps = [ - ":cudnn_conv_pad_for_tensor_cores", + ":cudnn_pad_for_convolutions", ":ir_emission_utils", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", @@ -1053,9 +1054,9 @@ cc_library( deps = [ ":alias_passthrough_params", ":cudnn_batchnorm_rewriter", - ":cudnn_conv_algorithm_picker", ":cudnn_conv_padding_legalization", ":cudnn_conv_rewriter", + ":cudnn_pad_for_convolutions", ":fusion_merger", ":gpu_constants", ":gpu_copy_insertion", @@ -1156,10 +1157,10 @@ cc_library( deps = [ ":cublas_gemm_pad_for_tensor_cores", ":cudnn_conv_algorithm_picker", - ":cudnn_conv_pad_for_tensor_cores", ":cudnn_conv_padding_legalization", ":cudnn_conv_rewriter", ":cudnn_fused_conv_rewriter", + ":cudnn_pad_for_convolutions", ":cusolver_rewriter", ":gemm_algorithm_picker", ":gemm_rewriter", @@ -1190,7 +1191,7 @@ cc_library( "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:stream_executor_headers", "//tensorflow/stream_executor/cuda:cuda_diagnostics", - "//tensorflow/stream_executor/cuda:ptxas_utils", + "//tensorflow/stream_executor/gpu:asm_compiler", "@com_google_absl//absl/container:node_hash_map", "@com_google_absl//absl/types:optional", ], @@ -1413,7 +1414,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:kernel_spec", - "//tensorflow/stream_executor/cuda:ptxas_utils", + "//tensorflow/stream_executor/gpu:asm_compiler", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", @@ -1509,9 +1510,15 @@ cc_library( tf_cc_test( name = "cudnn_fused_conv_rewriter_test", srcs = ["cudnn_fused_conv_rewriter_test.cc"], - tags = tf_cuda_tests_tags(), + tags = [ + "noasan", + "nomsan", + "requires-gpu-sm70", + ], deps = [ + ":cudnn_fused_conv_rewriter", ":ir_emission_utils", + "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test", "//tensorflow/compiler/xla/tests:hlo_test_base", diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc index 30108315e4d..37095adf7c6 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc @@ -34,7 +34,7 @@ namespace gpu { static constexpr double kTolerance = 0.1f; -// Comparison kernel code: compare two buffers of fp16/fp32/fp64 of length +// Comparison kernel code: compare two buffers of 
fp16/fp32/fp64/int8 of length // buffer_length where the relative error does not exceed the passed // rel_error_threshold. Write the number of mismatches into out parameter // mismatch_count. @@ -46,12 +46,20 @@ static constexpr double kTolerance = 0.1f; // // #include // extern "C" { // avoid name mangling -// __device__ float canonicalize(float input) { +// __device__ float __xla_buffer_comparator_canonicalize(float input) { // // All fp16 infinities are treated as 65505 or -65505, in order to avoid // // differences due to overflows. // return isnan(input) ? input : max(-65505.0f, min(input, 65505.0f)); // } -// + +// __device__ float __xla_buffer_comparator_extract_int8(int pack) { +// // Extract the lower 8 bits from pack and convert it to float +// const unsigned int bit_mask = 0xff; +// unsigned int bits = pack & bit_mask; +// char* int8_ptr = (char*)&bits; +// return __int2float_rn(*int8_ptr); +// } + // __global__ void __xla_fp16_comparison(__half* buffer_a, __half* buffer_b, // float rel_error_threshold, // unsigned long long buffer_length, @@ -60,15 +68,15 @@ static constexpr double kTolerance = 0.1f; // if (idx >= buffer_length) return; // float elem_a = __half2float(buffer_a[idx]); // float elem_b = __half2float(buffer_b[idx]); -// elem_a = canonicalize(elem_a); -// elem_b = canonicalize(elem_b); +// elem_a = __xla_buffer_comparator_canonicalize(elem_a); +// elem_b = __xla_buffer_comparator_canonicalize(elem_b); // if (isnan(elem_a) && isnan(elem_b)) return; // float rel_error = abs(elem_a - elem_b) // / (max(abs(elem_a), abs(elem_b)) + 1); // if (rel_error > rel_error_threshold || isnan(rel_error)) // atomicAdd(mismatch_count, 1); // } -// + // __global__ void __xla_fp32_comparison(float* buffer_a, float* buffer_b, // float rel_error_threshold, // unsigned long long buffer_length, @@ -85,7 +93,7 @@ static constexpr double kTolerance = 0.1f; // if (rel_error > rel_error_threshold || isnan(rel_error)) // atomicAdd(mismatch_count, 1); // } -// + // __global__ void __xla_fp64_comparison(double* buffer_a, double* buffer_b, // float rel_error_threshold, // unsigned long long buffer_length, @@ -102,234 +110,440 @@ static constexpr double kTolerance = 0.1f; // if (rel_error > rel_error_threshold || isnan(rel_error)) // atomicAdd(mismatch_count, 1); // } + +// __global__ void __xla_int8_comparison(int* buffer_a, int* buffer_b, +// float rel_error_threshold, +// unsigned long long buffer_length, +// int* mismatch_count) { +// int idx = threadIdx.x + blockIdx.x * blockDim.x; +// if (idx >= buffer_length) return; +// int pack_a = buffer_a[idx]; +// int pack_b = buffer_b[idx]; +// for(int i = 0; i < 4; ++i) { +// float elem_a = __xla_buffer_comparator_extract_int8(pack_a); +// float elem_b = __xla_buffer_comparator_extract_int8(pack_b); +// float rel_error = abs(elem_a - elem_b) +// / (max(abs(elem_a), abs(elem_b)) + 1); +// if (rel_error > rel_error_threshold || isnan(rel_error)) +// atomicAdd(mismatch_count, 1); +// pack_a >>= 8; +// pack_b >>= 8; +// } +// } // } // end extern declaration. 
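The PTX string below is the compiled form of the commented CUDA source above. As a plain-C++ reference for what the new __xla_int8_comparison entry point computes, the following host-side sketch performs the same per-element check; the helper name and the sequential loop are illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Host-side reference for the device-side check: each 32-bit word packs four
// int8 elements; every element pair is widened to float and compared with the
// same relative-error rule used by the fp16/fp32/fp64 kernels.
int64_t CountInt8Mismatches(const int32_t* buffer_a, const int32_t* buffer_b,
                            int64_t num_packs, float rel_error_threshold) {
  int64_t mismatch_count = 0;
  for (int64_t i = 0; i < num_packs; ++i) {
    int32_t pack_a = buffer_a[i];
    int32_t pack_b = buffer_b[i];
    for (int k = 0; k < 4; ++k) {
      // Extract the low byte of each pack and convert it to float, as
      // __xla_buffer_comparator_extract_int8 does on the device.
      float elem_a = static_cast<float>(static_cast<int8_t>(pack_a & 0xff));
      float elem_b = static_cast<float>(static_cast<int8_t>(pack_b & 0xff));
      float rel_error = std::abs(elem_a - elem_b) /
                        (std::max(std::abs(elem_a), std::abs(elem_b)) + 1.0f);
      if (rel_error > rel_error_threshold || std::isnan(rel_error)) {
        ++mismatch_count;
      }
      pack_a >>= 8;
      pack_b >>= 8;
    }
  }
  return mismatch_count;
}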
static const char* buffer_compare_ptx = R"( .version 4.2 .target sm_30 .address_size 64 + // .globl __xla_fp16_comparison + .visible .entry __xla_fp16_comparison( - .param .u64 __xla_fp16_comparison_param_0, - .param .u64 __xla_fp16_comparison_param_1, - .param .f32 __xla_fp16_comparison_param_2, - .param .u64 __xla_fp16_comparison_param_3, - .param .u64 __xla_fp16_comparison_param_4 + .param .u64 __xla_fp16_comparison_param_0, + .param .u64 __xla_fp16_comparison_param_1, + .param .f32 __xla_fp16_comparison_param_2, + .param .u64 __xla_fp16_comparison_param_3, + .param .u64 __xla_fp16_comparison_param_4 ) { - .reg .pred %p<10>; - .reg .b16 %rs<3>; - .reg .f32 %f<20>; - .reg .b32 %r<6>; - .reg .b64 %rd<12>; - ld.param.u64 %rd8, [__xla_fp16_comparison_param_3]; - mov.u32 %r1, %tid.x; - mov.u32 %r2, %ctaid.x; - mov.u32 %r3, %ntid.x; - mad.lo.s32 %r4, %r3, %r2, %r1; - cvt.s64.s32 %rd4, %r4; - setp.ge.u64 %p1, %rd4, %rd8; - @%p1 bra LBB7_4; - ld.param.u64 %rd5, [__xla_fp16_comparison_param_0]; - ld.param.u64 %rd7, [__xla_fp16_comparison_param_1]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd5; - shl.b64 %rd9, %rd4, 1; - add.s64 %rd10, %rd3, %rd9; - ld.global.u16 %rs1, [%rd10]; - // begin inline asm - { cvt.f32.f16 %f6, %rs1;} + .reg .pred %p<9>; + .reg .b16 %rs<3>; + .reg .f32 %f<28>; + .reg .b32 %r<6>; + .reg .b64 %rd<12>; - // end inline asm - add.s64 %rd11, %rd2, %rd9; - ld.global.u16 %rs2, [%rd11]; - // begin inline asm - { cvt.f32.f16 %f7, %rs2;} - // end inline asm - abs.f32 %f8, %f6; - setp.gtu.f32 %p2, %f8, 0f7F800000; - min.f32 %f9, %f6, 0f477FE100; - max.f32 %f10, %f9, 0fC77FE100; - selp.f32 %f1, %f6, %f10, %p2; - abs.f32 %f11, %f7; - setp.gtu.f32 %p3, %f11, 0f7F800000; - min.f32 %f12, %f7, 0f477FE100; - max.f32 %f13, %f12, 0fC77FE100; - selp.f32 %f2, %f7, %f13, %p3; - abs.f32 %f3, %f1; - setp.gtu.f32 %p4, %f3, 0f7F800000; - abs.f32 %f4, %f2; - setp.gtu.f32 %p5, %f4, 0f7F800000; - and.pred %p6, %p4, %p5; - @%p6 bra LBB7_4; - ld.param.f32 %f5, [__xla_fp16_comparison_param_2]; - sub.f32 %f14, %f1, %f2; - abs.f32 %f15, %f14; - max.f32 %f16, %f3, %f4; - add.f32 %f17, %f16, 0f3F800000; - div.rn.f32 %f18, %f15, %f17; - setp.leu.f32 %p7, %f18, %f5; - abs.f32 %f19, %f18; - setp.le.f32 %p8, %f19, 0f7F800000; - and.pred %p9, %p7, %p8; - @%p9 bra LBB7_4; - ld.param.u64 %rd6, [__xla_fp16_comparison_param_4]; - cvta.to.global.u64 %rd1, %rd6; - atom.global.add.u32 %r5, [%rd1], 1; -LBB7_4: - ret; + ld.param.u64 %rd1, [__xla_fp16_comparison_param_0]; + ld.param.u64 %rd2, [__xla_fp16_comparison_param_1]; + ld.param.f32 %f10, [__xla_fp16_comparison_param_2]; + ld.param.u64 %rd4, [__xla_fp16_comparison_param_3]; + ld.param.u64 %rd3, [__xla_fp16_comparison_param_4]; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %tid.x; + mad.lo.s32 %r1, %r2, %r3, %r4; + cvt.s64.s32 %rd5, %r1; + setp.ge.u64 %p1, %rd5, %rd4; + @%p1 bra BB0_9; + cvta.to.global.u64 %rd6, %rd1; + mul.wide.s32 %rd7, %r1, 2; + add.s64 %rd8, %rd6, %rd7; + ld.global.u16 %rs1, [%rd8]; + // inline asm + { cvt.f32.f16 %f26, %rs1;} + + // inline asm + cvta.to.global.u64 %rd9, %rd2; + add.s64 %rd10, %rd9, %rd7; + ld.global.u16 %rs2, [%rd10]; + // inline asm + { cvt.f32.f16 %f27, %rs2;} + + // inline asm + abs.f32 %f13, %f26; + setp.gtu.f32 %p2, %f13, 0f7F800000; + @%p2 bra BB0_3; + + mov.f32 %f14, 0f477FE100; + min.f32 %f15, %f26, %f14; + mov.f32 %f16, 0fC77FE100; + max.f32 %f26, %f16, %f15; + +BB0_3: + abs.f32 %f17, %f27; + setp.gtu.f32 %p3, %f17, 0f7F800000; + @%p3 bra BB0_5; + + mov.f32 %f18, 0f477FE100; + 
min.f32 %f19, %f27, %f18; + mov.f32 %f20, 0fC77FE100; + max.f32 %f27, %f20, %f19; + +BB0_5: + abs.f32 %f7, %f26; + setp.gtu.f32 %p4, %f7, 0f7F800000; + abs.f32 %f8, %f27; + setp.gtu.f32 %p5, %f8, 0f7F800000; + and.pred %p6, %p4, %p5; + @%p6 bra BB0_9; + + sub.f32 %f21, %f26, %f27; + abs.f32 %f22, %f21; + max.f32 %f23, %f7, %f8; + add.f32 %f24, %f23, 0f3F800000; + div.rn.f32 %f9, %f22, %f24; + setp.gt.f32 %p7, %f9, %f10; + @%p7 bra BB0_8; + + abs.f32 %f25, %f9; + setp.le.f32 %p8, %f25, 0f7F800000; + @%p8 bra BB0_9; + +BB0_8: + cvta.to.global.u64 %rd11, %rd3; + atom.global.add.u32 %r5, [%rd11], 1; + +BB0_9: + ret; } - // .globl __xla_fp32_comparison + + // .globl __xla_fp32_comparison .visible .entry __xla_fp32_comparison( - .param .u64 __xla_fp32_comparison_param_0, - .param .u64 __xla_fp32_comparison_param_1, - .param .f32 __xla_fp32_comparison_param_2, - .param .u64 __xla_fp32_comparison_param_3, - .param .u64 __xla_fp32_comparison_param_4 + .param .u64 __xla_fp32_comparison_param_0, + .param .u64 __xla_fp32_comparison_param_1, + .param .f32 __xla_fp32_comparison_param_2, + .param .u64 __xla_fp32_comparison_param_3, + .param .u64 __xla_fp32_comparison_param_4 ) { - .reg .pred %p<12>; - .reg .f32 %f<12>; - .reg .b32 %r<9>; - .reg .b64 %rd<12>; + .reg .pred %p<10>; + .reg .b16 %rs<3>; + .reg .f32 %f<13>; + .reg .b32 %r<10>; + .reg .b64 %rd<12>; - ld.param.u64 %rd8, [__xla_fp32_comparison_param_3]; - mov.u32 %r1, %tid.x; - mov.u32 %r2, %ctaid.x; - mov.u32 %r3, %ntid.x; - mad.lo.s32 %r4, %r3, %r2, %r1; - cvt.s64.s32 %rd4, %r4; - setp.ge.u64 %p1, %rd4, %rd8; - @%p1 bra LBB8_6; - ld.param.u64 %rd5, [__xla_fp32_comparison_param_0]; - ld.param.u64 %rd7, [__xla_fp32_comparison_param_1]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd5; - shl.b64 %rd9, %rd4, 2; - add.s64 %rd10, %rd3, %rd9; - ld.global.f32 %f1, [%rd10]; - add.s64 %rd11, %rd2, %rd9; - ld.global.f32 %f2, [%rd11]; - abs.f32 %f3, %f1; - setp.gtu.f32 %p2, %f3, 0f7F800000; - abs.f32 %f4, %f2; - setp.gtu.f32 %p3, %f4, 0f7F800000; - and.pred %p4, %p2, %p3; - @%p4 bra LBB8_6; - setp.neu.f32 %p5, %f3, 0f7F800000; - setp.neu.f32 %p6, %f4, 0f7F800000; - or.pred %p7, %p5, %p6; - @%p7 bra LBB8_4; - mov.b32 %r5, %f1; - mov.b32 %r6, %f2; - xor.b32 %r7, %r6, %r5; - setp.gt.s32 %p8, %r7, -1; - @%p8 bra LBB8_6; -LBB8_4: - ld.param.f32 %f5, [__xla_fp32_comparison_param_2]; - sub.f32 %f6, %f1, %f2; - abs.f32 %f7, %f6; - max.f32 %f8, %f3, %f4; - add.f32 %f9, %f8, 0f3F800000; - div.rn.f32 %f10, %f7, %f9; - setp.leu.f32 %p9, %f10, %f5; - abs.f32 %f11, %f10; - setp.le.f32 %p10, %f11, 0f7F800000; - and.pred %p11, %p9, %p10; - @%p11 bra LBB8_6; - ld.param.u64 %rd6, [__xla_fp32_comparison_param_4]; - cvta.to.global.u64 %rd1, %rd6; - atom.global.add.u32 %r8, [%rd1], 1; -LBB8_6: - ret; + ld.param.u64 %rd1, [__xla_fp32_comparison_param_0]; + ld.param.u64 %rd2, [__xla_fp32_comparison_param_1]; + ld.param.f32 %f6, [__xla_fp32_comparison_param_2]; + ld.param.u64 %rd4, [__xla_fp32_comparison_param_3]; + ld.param.u64 %rd3, [__xla_fp32_comparison_param_4]; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %tid.x; + mad.lo.s32 %r1, %r2, %r3, %r4; + cvt.s64.s32 %rd5, %r1; + setp.ge.u64 %p1, %rd5, %rd4; + @%p1 bra BB1_8; + + cvta.to.global.u64 %rd6, %rd1; + mul.wide.s32 %rd7, %r1, 4; + add.s64 %rd8, %rd6, %rd7; + cvta.to.global.u64 %rd9, %rd2; + add.s64 %rd10, %rd9, %rd7; + ld.global.f32 %f1, [%rd10]; + ld.global.f32 %f2, [%rd8]; + abs.f32 %f3, %f2; + setp.le.f32 %p2, %f3, 0f7F800000; + @%p2 bra BB1_3; + + abs.f32 %f7, %f1; + setp.gtu.f32 
%p3, %f7, 0f7F800000; + @%p3 bra BB1_8; + +BB1_3: + setp.neu.f32 %p4, %f3, 0f7F800000; + abs.f32 %f4, %f1; + setp.neu.f32 %p5, %f4, 0f7F800000; + or.pred %p6, %p4, %p5; + @%p6 bra BB1_5; + + mov.b32 %r5, %f2; + shr.u32 %r6, %r5, 31; + cvt.u16.u32 %rs1, %r6; + mov.b32 %r7, %f1; + shr.u32 %r8, %r7, 31; + cvt.u16.u32 %rs2, %r8; + setp.eq.s16 %p7, %rs1, %rs2; + @%p7 bra BB1_8; + +BB1_5: + sub.f32 %f8, %f2, %f1; + abs.f32 %f9, %f8; + max.f32 %f10, %f3, %f4; + add.f32 %f11, %f10, 0f3F800000; + div.rn.f32 %f5, %f9, %f11; + setp.gt.f32 %p8, %f5, %f6; + @%p8 bra BB1_7; + + abs.f32 %f12, %f5; + setp.le.f32 %p9, %f12, 0f7F800000; + @%p9 bra BB1_8; + +BB1_7: + cvta.to.global.u64 %rd11, %rd3; + atom.global.add.u32 %r9, [%rd11], 1; + +BB1_8: + ret; } - // .globl __xla_fp64_comparison + + // .globl __xla_fp64_comparison .visible .entry __xla_fp64_comparison( - .param .u64 __xla_fp64_comparison_param_0, - .param .u64 __xla_fp64_comparison_param_1, - .param .f32 __xla_fp64_comparison_param_2, - .param .u64 __xla_fp64_comparison_param_3, - .param .u64 __xla_fp64_comparison_param_4 + .param .u64 __xla_fp64_comparison_param_0, + .param .u64 __xla_fp64_comparison_param_1, + .param .f32 __xla_fp64_comparison_param_2, + .param .u64 __xla_fp64_comparison_param_3, + .param .u64 __xla_fp64_comparison_param_4 ) { - .reg .pred %p<16>; - .reg .f32 %f<2>; - .reg .b32 %r<13>; - .reg .f64 %fd<12>; - .reg .b64 %rd<12>; + .reg .pred %p<11>; + .reg .b16 %rs<3>; + .reg .f32 %f<2>; + .reg .b32 %r<14>; + .reg .f64 %fd<13>; + .reg .b64 %rd<12>; - ld.param.u64 %rd8, [__xla_fp64_comparison_param_3]; - mov.u32 %r2, %tid.x; - mov.u32 %r3, %ctaid.x; - mov.u32 %r4, %ntid.x; - mad.lo.s32 %r5, %r4, %r3, %r2; - cvt.s64.s32 %rd4, %r5; - setp.ge.u64 %p1, %rd4, %rd8; - @%p1 bra LBB9_6; - ld.param.u64 %rd5, [__xla_fp64_comparison_param_0]; - ld.param.u64 %rd7, [__xla_fp64_comparison_param_1]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd5; - shl.b64 %rd9, %rd4, 3; - add.s64 %rd10, %rd3, %rd9; - ld.global.f64 %fd1, [%rd10]; - add.s64 %rd11, %rd2, %rd9; - ld.global.f64 %fd2, [%rd11]; - abs.f64 %fd3, %fd1; - setp.gtu.f64 %p2, %fd3, 0d7FF0000000000000; - abs.f64 %fd4, %fd2; - setp.gtu.f64 %p3, %fd4, 0d7FF0000000000000; - and.pred %p4, %p2, %p3; - @%p4 bra LBB9_6; - { - .reg .b32 %temp; - mov.b64 {%r6, %temp}, %fd1; - } - { - .reg .b32 %temp; - mov.b64 {%temp, %r1}, %fd1; - } - and.b32 %r7, %r1, 2147483647; - setp.ne.s32 %p5, %r7, 2146435072; - setp.ne.s32 %p6, %r6, 0; - or.pred %p7, %p6, %p5; - @%p7 bra LBB9_4; - { - .reg .b32 %temp; - mov.b64 {%r8, %temp}, %fd2; - } - { - .reg .b32 %temp; - mov.b64 {%temp, %r9}, %fd2; - } - and.b32 %r10, %r9, 2147483647; - setp.eq.s32 %p8, %r10, 2146435072; - setp.eq.s32 %p9, %r8, 0; - and.pred %p10, %p8, %p9; - xor.b32 %r11, %r9, %r1; - setp.gt.s32 %p11, %r11, -1; - and.pred %p12, %p11, %p10; - @%p12 bra LBB9_6; -LBB9_4: - ld.param.f32 %f1, [__xla_fp64_comparison_param_2]; - sub.f64 %fd5, %fd1, %fd2; - abs.f64 %fd6, %fd5; - max.f64 %fd7, %fd3, %fd4; - add.f64 %fd8, %fd7, 0d3FF0000000000000; - div.rn.f64 %fd9, %fd6, %fd8; - cvt.f64.f32 %fd10, %f1; - setp.leu.f64 %p13, %fd9, %fd10; - abs.f64 %fd11, %fd9; - setp.le.f64 %p14, %fd11, 0d7FF0000000000000; - and.pred %p15, %p13, %p14; - @%p15 bra LBB9_6; - ld.param.u64 %rd6, [__xla_fp64_comparison_param_4]; - cvta.to.global.u64 %rd1, %rd6; - atom.global.add.u32 %r12, [%rd1], 1; -LBB9_6: - ret; + + ld.param.u64 %rd1, [__xla_fp64_comparison_param_0]; + ld.param.u64 %rd2, [__xla_fp64_comparison_param_1]; + ld.param.f32 %f1, 
[__xla_fp64_comparison_param_2]; + ld.param.u64 %rd4, [__xla_fp64_comparison_param_3]; + ld.param.u64 %rd3, [__xla_fp64_comparison_param_4]; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r4, %r5, %r6; + cvt.s64.s32 %rd5, %r1; + setp.ge.u64 %p1, %rd5, %rd4; + @%p1 bra BB2_11; + + cvta.to.global.u64 %rd6, %rd1; + mul.wide.s32 %rd7, %r1, 8; + add.s64 %rd8, %rd6, %rd7; + cvta.to.global.u64 %rd9, %rd2; + add.s64 %rd10, %rd9, %rd7; + ld.global.f64 %fd1, [%rd10]; + ld.global.f64 %fd2, [%rd8]; + abs.f64 %fd3, %fd2; + setp.le.f64 %p2, %fd3, 0d7FF0000000000000; + @%p2 bra BB2_3; + + abs.f64 %fd5, %fd1; + setp.gtu.f64 %p3, %fd5, 0d7FF0000000000000; + @%p3 bra BB2_11; + +BB2_3: + { + .reg .b32 %temp; + mov.b64 {%temp, %r2}, %fd2; + } + and.b32 %r7, %r2, 2147483647; + setp.ne.s32 %p4, %r7, 2146435072; + @%p4 bra BB2_8; + + { + .reg .b32 %temp; + mov.b64 {%r8, %temp}, %fd2; + } + setp.ne.s32 %p5, %r8, 0; + @%p5 bra BB2_8; + + { + .reg .b32 %temp; + mov.b64 {%temp, %r3}, %fd1; + } + and.b32 %r9, %r3, 2147483647; + setp.ne.s32 %p6, %r9, 2146435072; + @%p6 bra BB2_8; + + { + .reg .b32 %temp; + mov.b64 {%r10, %temp}, %fd1; + } + setp.ne.s32 %p7, %r10, 0; + @%p7 bra BB2_8; + + shr.u32 %r11, %r2, 31; + cvt.u16.u32 %rs1, %r11; + shr.u32 %r12, %r3, 31; + cvt.u16.u32 %rs2, %r12; + setp.eq.s16 %p8, %rs1, %rs2; + @%p8 bra BB2_11; + +BB2_8: + sub.f64 %fd6, %fd2, %fd1; + abs.f64 %fd7, %fd6; + abs.f64 %fd8, %fd1; + max.f64 %fd9, %fd3, %fd8; + add.f64 %fd10, %fd9, 0d3FF0000000000000; + div.rn.f64 %fd4, %fd7, %fd10; + cvt.f64.f32 %fd11, %f1; + setp.gt.f64 %p9, %fd4, %fd11; + @%p9 bra BB2_10; + + abs.f64 %fd12, %fd4; + setp.le.f64 %p10, %fd12, 0d7FF0000000000000; + @%p10 bra BB2_11; + +BB2_10: + cvta.to.global.u64 %rd11, %rd3; + atom.global.add.u32 %r13, [%rd11], 1; + +BB2_11: + ret; +} + + // .globl __xla_int8_comparison +.visible .entry __xla_int8_comparison( + .param .u64 __xla_int8_comparison_param_0, + .param .u64 __xla_int8_comparison_param_1, + .param .f32 __xla_int8_comparison_param_2, + .param .u64 __xla_int8_comparison_param_3, + .param .u64 __xla_int8_comparison_param_4 +) +{ + .reg .pred %p<10>; + .reg .f32 %f<42>; + .reg .b32 %r<23>; + .reg .b64 %rd<12>; + + + ld.param.u64 %rd2, [__xla_int8_comparison_param_0]; + ld.param.u64 %rd3, [__xla_int8_comparison_param_1]; + ld.param.f32 %f5, [__xla_int8_comparison_param_2]; + ld.param.u64 %rd4, [__xla_int8_comparison_param_3]; + ld.param.u64 %rd5, [__xla_int8_comparison_param_4]; + cvta.to.global.u64 %rd1, %rd5; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r4, %r5, %r6; + cvt.s64.s32 %rd6, %r1; + setp.ge.u64 %p1, %rd6, %rd4; + @%p1 bra BB3_13; + + cvta.to.global.u64 %rd7, %rd2; + mul.wide.s32 %rd8, %r1, 4; + add.s64 %rd9, %rd7, %rd8; + cvta.to.global.u64 %rd10, %rd3; + add.s64 %rd11, %rd10, %rd8; + ld.global.u32 %r2, [%rd9]; + cvt.s32.s8 %r7, %r2; + cvt.rn.f32.s32 %f6, %r7; + ld.global.u32 %r3, [%rd11]; + cvt.s32.s8 %r8, %r3; + cvt.rn.f32.s32 %f7, %r8; + sub.f32 %f8, %f6, %f7; + abs.f32 %f9, %f8; + abs.f32 %f10, %f6; + abs.f32 %f11, %f7; + max.f32 %f12, %f10, %f11; + add.f32 %f13, %f12, 0f3F800000; + div.rn.f32 %f1, %f9, %f13; + setp.gt.f32 %p2, %f1, %f5; + @%p2 bra BB3_3; + + abs.f32 %f14, %f1; + setp.le.f32 %p3, %f14, 0f7F800000; + @%p3 bra BB3_4; + +BB3_3: + atom.global.add.u32 %r9, [%rd1], 1; + +BB3_4: + shr.u32 %r10, %r3, 8; + shr.u32 %r11, %r2, 8; + cvt.s32.s8 %r12, %r11; + cvt.rn.f32.s32 %f15, %r12; + cvt.s32.s8 %r13, %r10; + cvt.rn.f32.s32 %f16, %r13; + sub.f32 %f17, 
%f15, %f16; + abs.f32 %f18, %f17; + abs.f32 %f19, %f15; + abs.f32 %f20, %f16; + max.f32 %f21, %f19, %f20; + add.f32 %f22, %f21, 0f3F800000; + div.rn.f32 %f2, %f18, %f22; + setp.gt.f32 %p4, %f2, %f5; + @%p4 bra BB3_6; + + abs.f32 %f23, %f2; + setp.le.f32 %p5, %f23, 0f7F800000; + @%p5 bra BB3_7; + +BB3_6: + atom.global.add.u32 %r14, [%rd1], 1; + +BB3_7: + shr.u32 %r15, %r3, 16; + shr.u32 %r16, %r2, 16; + cvt.s32.s8 %r17, %r16; + cvt.rn.f32.s32 %f24, %r17; + cvt.s32.s8 %r18, %r15; + cvt.rn.f32.s32 %f25, %r18; + sub.f32 %f26, %f24, %f25; + abs.f32 %f27, %f26; + abs.f32 %f28, %f24; + abs.f32 %f29, %f25; + max.f32 %f30, %f28, %f29; + add.f32 %f31, %f30, 0f3F800000; + div.rn.f32 %f3, %f27, %f31; + setp.gt.f32 %p6, %f3, %f5; + @%p6 bra BB3_9; + + abs.f32 %f32, %f3; + setp.le.f32 %p7, %f32, 0f7F800000; + @%p7 bra BB3_10; + +BB3_9: + atom.global.add.u32 %r19, [%rd1], 1; + +BB3_10: + shr.s32 %r20, %r2, 24; + cvt.rn.f32.s32 %f33, %r20; + shr.s32 %r21, %r3, 24; + cvt.rn.f32.s32 %f34, %r21; + sub.f32 %f35, %f33, %f34; + abs.f32 %f36, %f35; + abs.f32 %f37, %f33; + abs.f32 %f38, %f34; + max.f32 %f39, %f37, %f38; + add.f32 %f40, %f39, 0f3F800000; + div.rn.f32 %f4, %f36, %f40; + setp.gt.f32 %p8, %f4, %f5; + @%p8 bra BB3_12; + + abs.f32 %f41, %f4; + setp.le.f32 %p9, %f41, 0f7F800000; + @%p9 bra BB3_13; + +BB3_12: + atom.global.add.u32 %r22, [%rd1], 1; + +BB3_13: + ret; } )"; @@ -364,9 +578,9 @@ static StatusOr DeviceCompare(se::Stream* stream, uint64 buffer_size = lhs_typed.ElementCount(); TF_ASSIGN_OR_RETURN(absl::Span compiled_ptx, - se::cuda::CompilePtxOrGetCached( - executor->device_ordinal(), buffer_compare_ptx, - PtxOptsFromConfig(config))); + se::CompileGpuAsmOrGetCached(executor->device_ordinal(), + buffer_compare_ptx, + PtxOptsFromConfig(config))); TF_ASSIGN_OR_RETURN( std::unique_ptr> comparison_kernel, @@ -472,6 +686,9 @@ StatusOr BufferComparator::CompareEqual(se::Stream* stream, case xla::F64: return CompareEqualParameterized( stream, lhs, rhs, shape_, config_, "__xla_fp64_comparison"); + case xla::S8: + return CompareEqualParameterized( + stream, lhs, rhs, shape_, config_, "__xla_int8_comparison"); default: return Unimplemented("Unimplemented element type"); } diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc index 139e4204304..0f547111096 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc @@ -178,6 +178,13 @@ TEST_F(BufferComparatorTest, TestNumbers) { EXPECT_TRUE(CompareEqualFloatBuffers({0.9}, {1})); EXPECT_TRUE(CompareEqualFloatBuffers({9}, {10})); EXPECT_TRUE(CompareEqualFloatBuffers({10}, {9})); + + EXPECT_TRUE(CompareEqualFloatBuffers({200}, {201})); + EXPECT_FALSE(CompareEqualFloatBuffers({0}, {10})); + EXPECT_TRUE(CompareEqualFloatBuffers({9}, {10})); + EXPECT_TRUE(CompareEqualFloatBuffers({90}, {100})); + EXPECT_TRUE(CompareEqualFloatBuffers({100}, {90})); + EXPECT_FALSE(CompareEqualFloatBuffers({-128}, {127})); } TEST_F(BufferComparatorTest, TestMultiple) { @@ -231,6 +238,23 @@ TEST_F(BufferComparatorTest, TestMultiple) { rhs[i] = 0; } } + + { + EXPECT_TRUE(CompareEqualFloatBuffers({20, 30, 40, 50, 60}, + {21, 31, 41, 51, 61})); + std::vector lhs(200); + std::vector rhs(200); + for (int i = 0; i < 200; i++) { + EXPECT_TRUE(CompareEqualFloatBuffers(lhs, rhs)) + << "should be the same at index " << i; + lhs[i] = 3; + rhs[i] = 5; + EXPECT_FALSE(CompareEqualFloatBuffers(lhs, rhs)) + << "should be the different 
at index " << i; + lhs[i] = 0; + rhs[i] = 0; + } + } } } // namespace diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc index 60301b4de64..2fe359861f8 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc @@ -22,7 +22,6 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/memory/memory.h" -#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/compiler/xla/refcounting_hash_map.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index 7a7ab6ba05f..c829fc92c87 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -37,7 +37,7 @@ limitations under the License. #include "tensorflow/core/platform/logger.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/util/proto/proto_utils.h" -#include "tensorflow/stream_executor/cuda/redzone_allocator.h" +#include "tensorflow/stream_executor/gpu/redzone_allocator.h" namespace xla { namespace gpu { @@ -136,13 +136,13 @@ void PrintPlatformInfo(const se::Stream* stream) { // // `name` is a user-friendly name for the set of redzones being checked, e.g. // "input/output" or "scratch". -StatusOr CheckRedzones(const se::cuda::RedzoneAllocator& allocator, +StatusOr CheckRedzones(const se::RedzoneAllocator& allocator, se::Stream* stream, absl::string_view name, const HloInstruction* instr, AutotuneResult* result) { XLA_SCOPED_LOGGING_TIMER_LEVEL("CudnnConvAlgorithmPicker checking redzones", 2); - using RedzoneCheckStatus = se::cuda::RedzoneAllocator::RedzoneCheckStatus; + using RedzoneCheckStatus = se::RedzoneAllocator::RedzoneCheckStatus; TF_ASSIGN_OR_RETURN(RedzoneCheckStatus redzone_check, allocator.CheckRedzones()); if (redzone_check.ok()) { @@ -271,29 +271,29 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( int64 rng_state = 0; - const auto initialize_buffer = [stream, &result_shape, - &rng_state](DeviceMemoryBase buffer) { - InitializeFloatBuffer(stream, result_shape.element_type(), &rng_state, - buffer); + const auto initialize_buffer = [&stream, &rng_state]( + DeviceMemoryBase buffer, + const Shape& buffer_shape) { + InitializeBuffer(stream, buffer_shape.element_type(), &rng_state, buffer); }; const HloModuleConfig& hlo_module_config = instr->GetModule()->config(); // Allocate space for the input, filter, and output of the convolution. 
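  // (The redzone allocator surrounds every allocation with guard bytes;
  // CheckRedzones above re-reads those bytes after a candidate algorithm runs,
  // so out-of-bounds writes by a bad algorithm are detected.)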
- se::cuda::RedzoneAllocator input_output_allocator( + se::RedzoneAllocator input_output_allocator( stream, allocator, PtxOptsFromConfig(hlo_module_config)); std::vector operand_buffers; for (const auto* operand : instr->operands()) { TF_ASSIGN_OR_RETURN(auto buffer, input_output_allocator.AllocateBytes( ShapeUtil::ByteSizeOf(operand->shape()))); - initialize_buffer(buffer); + initialize_buffer(buffer, operand->shape()); operand_buffers.push_back(buffer); } TF_ASSIGN_OR_RETURN(auto result_buffer, input_output_allocator.AllocateBytes( ShapeUtil::ByteSizeOf(result_shape))); - initialize_buffer(result_buffer); + initialize_buffer(result_buffer, result_shape); TF_ASSIGN_OR_RETURN(auto backend_config, instr->backend_config()); @@ -339,7 +339,7 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( continue; } - se::cuda::RedzoneAllocator scratch_allocator( + se::RedzoneAllocator scratch_allocator( stream, allocator, PtxOptsFromConfig(hlo_module_config)); se::dnn::ProfileResult profile_result; VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for " diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h deleted file mode 100644 index d4e51e86c1b..00000000000 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PAD_FOR_TENSOR_CORES_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PAD_FOR_TENSOR_CORES_H_ - -#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" - -namespace xla { -namespace gpu { - -// Adds padding to cudnn convolutions to make them run faster on GPUs with -// tensor cores. -// -// - f16 convolutions are padded to have input/output channel dimensions that -// are multiples of 8, so that we can use tensor cores. -// -// - f16 convolutions with 3 input channels and 32 or 64 output channels are -// padded to 4 input channels. There's a special-cased cudnn algorithm just -// for this. -// -// Don't run this pass on GPUs without tensor cores -- it will make them slower! -// -// TODO(jlebar): Also pad dots. 
-class CudnnConvPadForTensorCores : public HloModulePass { - public: - absl::string_view name() const override { return "cudnn-conv-pad-for-speed"; } - - StatusOr Run(HloModule* module) override; -}; - -} // namespace gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PAD_FOR_TENSOR_CORES_H_ diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index 8596f640fc2..7b23935fbac 100755 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -619,43 +619,85 @@ CudnnConvBackendConfig GetDefaultBackendConfig() { return config; } +// Helper function to create a custom_call instruction to replace the given +// conv instruction +static StatusOr CreateCustomCallHelper(HloInstruction* conv) { + bool match; + Window window; + ConvolutionDimensionNumbers dnums; + HloInstruction* rhs; + HloInstruction* lhs; + + std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv); + if (match) { + return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, conv->shape(), + conv->mutable_operand(0), rhs, window, dnums, + conv->feature_group_count(), conv->metadata()); + } + + std::tie(match, window, dnums, lhs) = MatchBackwardFilter(conv); + if (match) { + return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(), + lhs, conv->mutable_operand(1), window, dnums, + conv->feature_group_count(), conv->metadata()); + } + + // If all else fails, try a forward convolution. + if (CanImplementAsCudnnForwardConv(conv)) { + if (primitive_util::IsIntegralType( + conv->operand(0)->shape().element_type())) { + // In addition to replacing a convolution instruction with + // a custom call, integer convolutions must have this pattern to match + // CuDNN semantics: + // conv( + // convert(int8_x), convert(int8_y)) + // We transform it to: + // custom_call(int8_x, int8_y, target=cudnnConvolutionForward) + // + // We will error out, if the pattern is not found for integer + // convolution. + const auto is_int8_to_int32_cast = + [](const HloInstruction* instr) -> bool { + return (instr->opcode() == HloOpcode::kConvert && + instr->operand(0)->shape().element_type() == S8 && + instr->shape().element_type() == S32); + }; + HloInstruction* input_convert = conv->mutable_operand(0); + HloInstruction* kernel_convert = conv->mutable_operand(1); + if (conv->shape().element_type() != S32 || + !is_int8_to_int32_cast(input_convert) || + !is_int8_to_int32_cast(kernel_convert)) { + return Unimplemented( + "Integer convolutions for CuDNN must have this pattern: " + "conv(convert(int8_x), " + "convert(int8_y))"); + } + // Bypass the convert for both inputs. + TF_RETURN_IF_ERROR(conv->ReplaceOperandWithDifferentShape( + 0, input_convert->mutable_operand(0))); + TF_RETURN_IF_ERROR( + conv->parent()->RemoveInstructionAndUnusedOperands(input_convert)); + TF_RETURN_IF_ERROR(conv->ReplaceOperandWithDifferentShape( + 1, kernel_convert->mutable_operand(0))); + TF_RETURN_IF_ERROR( + conv->parent()->RemoveInstructionAndUnusedOperands(kernel_convert)); + } + return CreateCudnnConv(kCudnnConvForwardCallTarget, conv->shape(), + conv->mutable_operand(0), conv->mutable_operand(1), + conv->window(), + conv->convolution_dimension_numbers(), + conv->feature_group_count(), conv->metadata()); + } + + return nullptr; +} + // Tries to rewrite a single convolution into a call to cudnn. 
StatusOr RunOnInstruction(HloInstruction* conv) { CHECK_EQ(conv->opcode(), HloOpcode::kConvolution); - HloInstruction* custom_call = [&]() -> HloInstruction* { - bool match; - Window window; - ConvolutionDimensionNumbers dnums; - HloInstruction* rhs; - HloInstruction* lhs; - - std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv); - if (match) { - return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, conv->shape(), - conv->mutable_operand(0), rhs, window, dnums, - conv->feature_group_count(), conv->metadata()); - } - - std::tie(match, window, dnums, lhs) = MatchBackwardFilter(conv); - if (match) { - return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(), - lhs, conv->mutable_operand(1), window, dnums, - conv->feature_group_count(), conv->metadata()); - } - - // If all else fails, try a forward convolution. - if (CanImplementAsCudnnForwardConv(conv)) { - return CreateCudnnConv(kCudnnConvForwardCallTarget, conv->shape(), - conv->mutable_operand(0), conv->mutable_operand(1), - conv->window(), - conv->convolution_dimension_numbers(), - conv->feature_group_count(), conv->metadata()); - } - - return nullptr; - }(); - + TF_ASSIGN_OR_RETURN(HloInstruction * custom_call, + CreateCustomCallHelper(conv)); if (custom_call == nullptr) { return false; } @@ -666,8 +708,8 @@ StatusOr RunOnInstruction(HloInstruction* conv) { VLOG(1) << "Replacing convolution " << conv->ToString() << " with " << custom_call->ToString(); - // The CustomCall returns a tuple (conv_result, scratch_memory). Extract out - // the conv result and replace `conv` with it. + // The CustomCall returns a tuple (conv_result, scratch_memory). Extract + // out the conv result and replace `conv` with it. TF_RETURN_IF_ERROR(conv->parent()->ReplaceWithNewInstruction( conv, HloInstruction::CreateGetTupleElement(conv->shape(), custom_call, 0))); diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h index d8ec72c27ba..77b57c910c9 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h @@ -24,6 +24,14 @@ namespace gpu { // Rewrites plain convolutions, backwards-filter convolutions, and // backwards-input convolutions into CustomCall HLOs that call into cuDNN. +// For integer convolution, it requires the following pattern: +// conv( +// convert(int8_x), convert(int8_y)) +// We transform it to: +// custom_call(int8_x, int8_y, target=cudnnForwardConvolution) +// Note that this pattern is necessary but not sufficient to map convolutions +// to CuDNN. More patterns will be matched in cudnn_fused_conv_rewriter. 
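// For example (shapes illustrative), a graph of the form
//   %conv = s32[1,32,9,9] convolution(convert(%int8_input), convert(%int8_filter)), ...
// becomes
//   %custom-call = (s32[1,32,9,9], u8[...]) custom-call(%int8_input, %int8_filter),
//       custom_call_target="__cudnn$convForward"
// followed by a get-tuple-element that extracts the convolution result.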
+ class CudnnConvRewriter : public HloModulePass { public: absl::string_view name() const override { return "cudnn-conv-rewriter"; } diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc index 362d8d13aab..815963bfa9f 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc @@ -711,6 +711,21 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveConstantFilter) { 0)); } +// Check that a forward convolution instruction with int8 inputs is not allowed +TEST_F(CudnnConvRewriterTest, TestForwardInt8Convolution) { + const string module_str = absl::StrFormat(R"( + HloModule Test + + ENTRY Test { + input = s8[1,2,3,3] parameter(0) + filter = s8[3,3,2,5] parameter(1) + + ROOT conv = s8[1,5,3,3] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1 + })"); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str)); + + ASSERT_FALSE(CudnnConvRewriter().Run(m.get()).ok()); +} } // anonymous namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc index aca7307e0c2..b2cac986761 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc @@ -26,8 +26,12 @@ namespace xla { namespace gpu { namespace { -// Describes a matched pattern: +// Describes matched patterns: // max(0, alpha1 * conv(x, w) + alpha2 * side_input + broadcast(bias)); +// for floating point types or +// max(0, alpha1 * conv(int8_x, int8_w) + alpha2 * +// * side_input + broadcast(bias)); +// for int8. // Where side_input has the shape of output buffer, and bias is a 1D array with // the dimension of number of output features. struct ConvWithRelu { @@ -39,6 +43,13 @@ struct ConvWithRelu { HloConstantInstruction* alpha_side_input; }; +// The pattern we want to match: +// max(0, alpha1 * conv(x, w) + alpha2 * side_input + broadcast(bias)); +// or +// max(0, alpha1 * conv(int8_x, int8_w) + alpha2 * +// * side_input + broadcast(bias)); +// With its variants involving commute/reassociation of adds, multiplies, and +// max, and omission of alpha1, side_input, alpha2, or bias. absl::optional FindConvWithRelu(HloInstruction* instr) { using match::Add; using match::AddAnyOrder; @@ -50,12 +61,6 @@ absl::optional FindConvWithRelu(HloInstruction* instr) { using match::MultiplyAnyOrder; using match::Op; - // The pattern we want to match: - // max(0, alpha1 * conv(x, w) + alpha2 * side_input + broadcast(bias)); - // - // With its variants involving commute/reassociation of adds, multiplies, and - // max, and omission of alpha1, side_input, alpha2, or bias. - HloInstruction* relu_input; // Match max(0, relu_input). @@ -149,6 +154,14 @@ absl::optional FindConvWithRelu(HloInstruction* instr) { return absl::nullopt; } + // In order to map to cudnnConvolutionBiasActivationForward for int8, the + // convolution output is float, i.e. conv(int8_x, int8_w) + if (conv->operand(0)->shape().element_type() == xla::S8) { + if (conv->shape().tuple_shapes(0).element_type() != xla::F32) { + return absl::nullopt; + } + } + if (bias_broadcast) { // TODO(timshen): handle bias_broadcast_instr->dimensions() == {}. 
if (bias_broadcast_instr->dimensions().size() != 1) { @@ -174,7 +187,6 @@ StatusOr> TryRewriteToCudnnForwardRelu( auto conv = match.conv; HloComputation* computation = conv->parent(); - PrimitiveType element_type = conv->operand(0)->shape().element_type(); const auto get_alpha_value = [](HloConstantInstruction* instr) -> StatusOr { @@ -204,13 +216,15 @@ StatusOr> TryRewriteToCudnnForwardRelu( auto bias = match.bias; if (!bias) { + PrimitiveType conv_output_type = + conv->shape().tuple_shapes(0).element_type(); auto zero = computation->AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::Zero(element_type))); + HloInstruction::CreateConstant(LiteralUtil::Zero(conv_output_type))); int64 num_output_feature = conv->shape().tuple_shapes(0).dimensions( conv->convolution_dimension_numbers().output_feature_dimension()); bias = computation->AddInstruction(HloInstruction::CreateBroadcast( - ShapeUtil::MakeShapeWithDescendingLayout(element_type, + ShapeUtil::MakeShapeWithDescendingLayout(conv_output_type, {num_output_feature}), zero, {})); } @@ -242,9 +256,9 @@ StatusOr> TryRewriteToCudnnForwardRelu( new_conv, 0); } -} // namespace - -StatusOr CudnnFusedConvRewriter::Run(HloModule* module) { +// Fuse bias/scaling/ReLU with convolution custom call with floating point +// output +StatusOr RunFuseBiasSideActivation(HloModule* module) { bool changed = false; for (HloComputation* computation : module->MakeNonfusionComputations()) { std::vector matches; @@ -277,5 +291,201 @@ StatusOr CudnnFusedConvRewriter::Run(HloModule* module) { return changed; } +// Describes a matched pattern: +// convert_or_clamp(get_tuple_element(custom_call(x,w, ...))); +// where the custom_call targets CuDNN convolution (either pure convolution or +// fused convolution). +struct ConvWithConvertOrClamp { + HloInstruction* convert_or_clamp; + HloInstruction* gte; + HloCustomCallInstruction* conv; +}; + +// The pattern we want to match: +// convert(clamp(broadcast(-128), (get_tuple_element(custom_call(int8_x, +// int8_w, ...)), broadcast(127)); +absl::optional FindConvWithClampAndConvertToInt8( + HloInstruction* instr) { + using match::Broadcast; + using match::Clamp; + using match::Convert; + using match::GetTupleElement; + using match::Op; + + HloInstruction* gte = nullptr; + HloInstruction* conv_instr = nullptr; + auto lower_pattern = Broadcast(match::ConstantScalar(-128)); + auto upper_pattern = Broadcast(match::ConstantScalar(127)); + auto pattern = Convert( + Clamp(lower_pattern, + GetTupleElement( + >e, Op(&conv_instr).WithOpcode(HloOpcode::kCustomCall), 0), + upper_pattern)); + + if (Match(instr, pattern)) { + if (conv_instr->operand(0)->shape().element_type() == xla::S8 && + instr->shape().element_type() == xla::S8) { + HloCustomCallInstruction* conv = + CastOrNull(conv_instr); + return ConvWithConvertOrClamp{instr, gte, conv}; + } + } + return absl::nullopt; +} + +// A help function to rewrite convert_or_clamp_or_other(gte(conv())) +// to gte(conv()). It bypasses convert_or_clamp_or_other +// and set the output data type on gte and conv. 
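// For example (illustrative), the rewrite turns
//   %cast = s8[...] convert(%gte)   ; %gte = s32[...] get-tuple-element(%custom-call), index=0
// into just
//   %gte = s8[...] get-tuple-element(%custom-call), index=0
// with the element type of the custom call's first tuple element updated to
// match.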
+Status RewriteForConvertOrClampImpl(ConvWithConvertOrClamp match) { + auto conv = match.conv; + auto gte = match.gte; + auto convert_or_clamp = match.convert_or_clamp; + + // Change type on conv and gte + auto convert_out_type = convert_or_clamp->shape().element_type(); + conv->mutable_shape()->mutable_tuple_shapes(0)->set_element_type( + convert_out_type); + gte->mutable_shape()->set_element_type(convert_out_type); + + // Remove clamp/convert and so on and just keep + // get_tuple_element(custom_call(x,w, ...)) + TF_RETURN_IF_ERROR(convert_or_clamp->ReplaceAllUsesWithDifferentShape(gte)); + TF_RETURN_IF_ERROR( + conv->parent()->RemoveInstructionAndUnusedOperands(convert_or_clamp)); + return Status::OK(); +} + +Status RewriteForFinalOutput(ConvWithConvertOrClamp match) { + // When the matched clamp has a single user, which is convert, we + // will absorb it, if + // 1. the side_input matches a convert(int8_side_input), or + // 2. there is no side input + const auto is_one_to_one_X_to_Y_cast = [](const HloInstruction* instr, + PrimitiveType X, + PrimitiveType Y) -> bool { + return (instr->opcode() == HloOpcode::kConvert && + instr->shape().element_type() == Y && instr->operand_count() == 1 && + instr->operand(0)->user_count() == 1 && + instr->operand(0)->shape().element_type() == X); + }; + + if (match.conv->operand_count() < 4) { + // Conv input #3 (zero based) is side_input, after x, w, and bias. + // Side input doesn't exist in this case. + TF_RETURN_IF_ERROR(RewriteForConvertOrClampImpl(match)); + } else if (is_one_to_one_X_to_Y_cast(match.conv->operand(3), S8, F32)) { + // If side_input has a convert_float_to_int8, absorb it as well. + auto side_converter = match.conv->mutable_operand(3); + TF_RETURN_IF_ERROR(side_converter->ReplaceAllUsesWithDifferentShape( + side_converter->mutable_operand(0))); + TF_RETURN_IF_ERROR( + side_converter->parent()->RemoveInstructionAndUnusedOperands( + side_converter)); + + TF_RETURN_IF_ERROR(RewriteForConvertOrClampImpl(match)); + } + return Status::OK(); +} + +// Fuse the clamp/convert pattern with the int8 convolution custom call +// (either pure or fused) for int8 output +StatusOr RunFuseClamp(HloModule* module) { + bool changed = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + std::vector matches; + for (auto instr : computation->instructions()) { + auto match = FindConvWithClampAndConvertToInt8(instr); + if (match.has_value()) { + matches.push_back(*match); + } + } + for (const ConvWithConvertOrClamp& match : matches) { + TF_RETURN_IF_ERROR(RewriteForFinalOutput(match)); + changed = true; + } + + // Report error for any convolution still having int32 output. + // Although int32 output convolution will trigger other sanity check errors + // later, we want to give specific error message here. + for (auto instr : computation->instructions()) { + if (auto call = DynCast(instr)) { + if ((call->custom_call_target() == kCudnnConvForwardCallTarget || + call->custom_call_target() == + kCudnnConvBiasActivationForwardCallTarget) && + call->shape().tuple_shapes(0).element_type() == xla::S32) { + return Unimplemented( + "Integer convolutions for CuDNN must have float or int8 output. 
" + "Use convert to cast output to float or the following pattern to " + "int8: " + "clamp(broadcast(-128), conv(int8_x, int8_w, ...), " + "broadcast(127))."); + } + } + } + } + return changed; +} + +// The pattern we want to match: +// convert(get_tuple_element(custom_call())); +absl::optional FindConvWithConvertToFloat( + HloInstruction* instr) { + using match::Convert; + using match::GetTupleElement; + using match::Op; + + HloInstruction* gte = nullptr; + HloInstruction* conv_instr = nullptr; + auto pattern = + Convert(GetTupleElement( + >e, + Op(&conv_instr) + .WithOpcode(HloOpcode::kCustomCall) + .WithCustomCallTarget(kCudnnConvForwardCallTarget), + 0) + .WithShape(match::Shape().WithElementType(xla::S32))) + .WithShape(match::Shape().WithElementType(xla::F32)); + if (Match(instr, pattern)) { + HloCustomCallInstruction* conv = + CastOrNull(conv_instr); + return ConvWithConvertOrClamp{instr, gte, conv}; + } + return absl::nullopt; +} + +// Transform +// convert(GetTupleElement(custom_call(int8_x, int8_w))) +// to +// GetTupleElement(custom_call(int8_x, int8_w)) +StatusOr RunFuseConvertToFloat(HloModule* module) { + bool changed = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + std::vector matches; + for (auto instr : computation->instructions()) { + auto match = FindConvWithConvertToFloat(instr); + if (match.has_value()) { + matches.push_back(*match); + } + } + + for (const ConvWithConvertOrClamp& match : matches) { + TF_RETURN_IF_ERROR(RewriteForConvertOrClampImpl(match)); + changed = true; + } + } + return changed; +} +} // namespace + +StatusOr CudnnFusedConvRewriter::Run(HloModule* module) { + TF_ASSIGN_OR_RETURN(bool fused_for_convert_to_float, + RunFuseConvertToFloat(module)); + + TF_ASSIGN_OR_RETURN(bool fused_for_bias, RunFuseBiasSideActivation(module)); + + TF_ASSIGN_OR_RETURN(bool fused_for_clamp, RunFuseClamp(module)); + + return fused_for_convert_to_float || fused_for_bias || fused_for_clamp; +} } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h index 613ed8dbdc3..e3602b70d29 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h @@ -22,6 +22,40 @@ limitations under the License. namespace xla { namespace gpu { +// Rewrite the custom call targeting cudnnConvolutionForward to +// cudnnConvolutionBiasActivationForward by fusing applicable point-wise +// operations following forward convolution. This transform must run after +// cudnn_conv_rewriter. 
+// It is straightforward for floating point convolutions: +// transforming +// max(0, alpha1 * conv(x, w) + alpha2 * side_input + broadcast(bias)) +// to +// cudnnConvolutionBiasActivationForward(x, w, bias, alpha1, alpha2, side) +// +// Integer convolution requires additional patterns to match CuDNN semantics: +// #1 from +// cast(clamp<-128, 127>(conv(int8_x, int8_w))) +// to +// cudnnConvolutionForward(int8_x, int8_w) +// or #2 from +// cast(conv(int8_x, int8_w)) +// to +// cudnnConvolutionForward(int8_x, int8_w) +// or #3 from +// cast(clamp<-128, 127>(max(0, alpha1 * +// cast(conv(int8_x, int8_w)) + +// alpha2 * cast(int8_side) + +// broadcast(bias))) +// to +// cudnnConvolutionBiasActivationForward(int8_x, int8_w, bias, alpha1, +// alpha2, int8_side) +// or #4 from +// max(0, alpha1 * cast(conv(int8_x, int8_w)) + +// alpha2 * float_side + broadcast(bias)) +// to +// cudnnConvolutionBiasActivationForward(int8_x, int8_w, bias, alpha1, +// alpha2, float_side) + class CudnnFusedConvRewriter : public HloModulePass { public: absl::string_view name() const override { diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc index b621880f639..bd6aa6e715a 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc @@ -13,9 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h" + #include "absl/strings/str_replace.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/platform/test.h" @@ -26,7 +30,7 @@ namespace { using ::testing::HasSubstr; using ::testing::Not; -class CudnnFusedConvRewriterTest : public HloTestBase { +class CudnnFusedConvRewriterTest : public GpuCodegenTest { protected: string GetOptimizedHlo(absl::string_view hlo_string) { return backend() @@ -53,6 +57,19 @@ class CudnnFusedConvRewriterTest : public HloTestBase { } } + void TestClamp(absl::string_view pre_hlo_string, + absl::string_view post_hlo_string) { + string alpha_conv_scalar, alpha_side_input_scalar; + string elementwise_type; + + string optimized_hlo_string = GetOptimizedHlo(pre_hlo_string); + EXPECT_THAT(optimized_hlo_string, Not(HasSubstr("Convert"))); + EXPECT_THAT(optimized_hlo_string, HasSubstr("__cudnn$conv")); + EXPECT_TRUE(RunAndCompare(pre_hlo_string, ErrorSpec{0.01})) + << pre_hlo_string; + MatchOptimizedHlo(pre_hlo_string, post_hlo_string); + } + void TestNotMatchWithAllTypes(absl::string_view hlo_string) { for (absl::string_view type : {"f16", "f32", "f64"}) { const string hlo_with_new_type = @@ -349,6 +366,350 @@ TEST_F(CudnnFusedConvRewriterTest, TestPreservesFeatureGroupCount) { EXPECT_TRUE(RunAndCompare(kHloString, ErrorSpec{0.01})); } +TEST_F(CudnnFusedConvRewriterTest, TestConvInt8ToInt8) { + // max(0, clamp(conv(x, w)))); for int8 + TestClamp( + // pre_hlo + R"( + HloModule Test + + ENTRY Test { + zero = s8[] constant(0) + zeros = s8[1,32,9,9] broadcast(zero), dimensions={} + + input = s8[1,17,9,9] parameter(0) + filter = 
s8[3,3,17,32] parameter(1) + + inputs32 = s32[1,17,9,9] convert(input) + filters32 = s32[3,3,17,32] convert(filter) + + conv = s32[1,32,9,9] convolution(inputs32, filters32), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1 + + lower = s32[] constant(-128) + lowers = s32[1,32,9,9] broadcast(lower), dimensions={} + upper = s32[] constant(127) + uppers = s32[1,32,9,9] broadcast(upper), dimensions={} + + clamp = s32[1,32,9,9] clamp(lowers, conv, uppers) + + convert = s8[1,32,9,9] convert(clamp) + ROOT relu = s8[1,32,9,9] maximum(zeros, convert) + })", + // post_hlo + R"( + ; CHECK-LABEL: ENTRY %Test (input: s8[1,17,9,9], filter: s8[3,3,17,32]) -> s8[1,32,9,9] { + ; CHECK: %custom-call{{(\.[0-9])?}} = (s8[1,32,9,9]{1,3,2,0}, u8[{{[0-9]*}}]{0}) custom-call(%fusion{{(\.[0-9])?}}, %fusion{{(\.[0-9])?}}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, custom_call_target="__cudnn$convForward", backend_config= + )"); +} + +TEST_F(CudnnFusedConvRewriterTest, TestConvInt8ToFloat) { + // convert(conv(convert(int8_x), + // convert(int8_w))); + TestClamp( + // pre_hlo + R"( + HloModule Test + + ENTRY Test { + input = s8[1,17,9,9] parameter(0) + filter = s8[3,3,17,32] parameter(1) + + inputs32 = s32[1,17,9,9] convert(input) + filters32 = s32[3,3,17,32] convert(filter) + + conv = s32[1,32,9,9] convolution(inputs32, filters32), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1 + + ROOT convert = f32[1,32,9,9] convert(conv) + })", + // post_hlo + R"( + ; CHECK-LABEL: ENTRY %Test (input: s8[1,17,9,9], filter: s8[3,3,17,32]) -> f32[1,32,9,9] { + ; CHECK: %custom-call{{(\.[0-9])?}} = (f32[1,32,9,9]{1,3,2,0}, u8[{{[0-9]+}}]{0}) custom-call(%fusion{{(\.[0-9])?}}, %fusion{{(\.[0-9])?}}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, custom_call_target="__cudnn$convForward", backend_config= + )"); +} + +TEST_F(CudnnFusedConvRewriterTest, TestFusedConvInt8ToInt8) { + // clamp(max(0, conv(x, w)+bias)); for int8 + TestClamp( + // pre_hlo + R"( + HloModule Test + + ENTRY Test { + zero = f32[] constant(0) + zeros = f32[1,3,3,64] broadcast(zero), dimensions={} + + input = s8[1,3,3,64] parameter(0) + filter = s8[3,3,64,64] parameter(1) + bias = f32[64] parameter(2) + + inputs32 = s32[1,3,3,64] convert(input) + filters32 = s32[3,3,64,64] convert(filter) + + conv = s32[1,3,3,64] convolution(inputs32, filters32), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1 + + convfloat = f32[1,3,3,64] convert(conv) + broadcasted_bias = f32[1,3,3,64] broadcast(bias), dimensions={3} + add1 = f32[1,3,3,64] add(convfloat, broadcasted_bias) + relu = f32[1,3,3,64] maximum(zeros, add1) + + lower = f32[] constant(-128) + lowers = f32[1,3,3,64] broadcast(lower), dimensions={} + upper = f32[] constant(127) + uppers = f32[1,3,3,64] broadcast(upper), dimensions={} + + clamp = f32[1,3,3,64] clamp(lowers, relu, uppers) + + ROOT convert = s8[1,3,3,64] convert(clamp) + })", + // post_hlo + R"( + ; CHECK-LABEL: ENTRY %Test (input: s8[1,3,3,64], filter: s8[3,3,64,64], bias: f32[64]) -> s8[1,3,3,64] + ; CHECK: %custom-call{{(\.[0-9])?}} = (s8[1,3,3,64]{3,2,1,0}, u8[{{[0-9]+}}]{0}) custom-call(%input, %copy{{(\.[0-9])?}}, %bias), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config= + ; CHECK-NEXT: ROOT %get-tuple-element{{(\.[0-9])?}} = s8[1,3,3,64]{3,2,1,0} get-tuple-element(%custom-call{{(\.[0-9])?}}), index=0 + )"); +} + +TEST_F(CudnnFusedConvRewriterTest, 
TestFusedConvInt8ToFloat) { + // max(0, convert(conv(int8_x), + // conv(int8_w))+float_bias)); int8 to float via bias. + TestClamp( + // pre_hlo + R"( + HloModule Test + + ENTRY Test { + zero = f32[] constant(0) + zeros = f32[1,3,3,64] broadcast(zero), dimensions={} + + input = s8[1,3,3,64] parameter(0) + filter = s8[3,3,64,64] parameter(1) + bias = f32[64] parameter(2) + + inputs32 = s32[1,3,3,64] convert(input) + filters32 = s32[3,3,64,64] convert(filter) + + conv = s32[1,3,3,64] convolution(inputs32, filters32), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1 + + convfloat = f32[1,3,3,64] convert(conv) + broadcasted_bias = f32[1,3,3,64] broadcast(bias), dimensions={3} + add1 = f32[1,3,3,64] add(convfloat, broadcasted_bias) + ROOT relu = f32[1,3,3,64] maximum(zeros, add1) + })", + // post_hlo + R"( + ; CHECK-LABEL: ENTRY %Test (input: s8[1,3,3,64], filter: s8[3,3,64,64], bias: f32[64]) -> f32[1,3,3,64] { + ; CHECK: %custom-call{{(\.[0-9])?}} = (f32[1,3,3,64]{3,2,1,0}, u8[{{[0-9]*}}]{0}) custom-call(%input, %copy{{(\.[0-9])?}}, %bias), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config= + ; CHECK-NEXT: ROOT %get-tuple-element{{(\.[0-9])?}} = f32[1,3,3,64]{3,2,1,0} get-tuple-element(%custom-call{{(\.[0-9])?}}), index=0 + )"); +} + +TEST_F(CudnnFusedConvRewriterTest, + TestFusedConvWithScaledInt8SideInputBiasInt8ToInt8) { + // clamp(max(0, alpha_conv * conv(x, w) + alpha_side * + // convert(int8_side_input) + bias)); for int8 + TestClamp( + // pre_hlo + R"( + HloModule Test + + ENTRY Test { + zero = f32[] constant(0) + zeros = f32[1,3,3,64] broadcast(zero), dimensions={} + alpha_conv_scalar = f32[] constant(0.999994934) + alpha_conv = f32[1,3,3,64] broadcast(alpha_conv_scalar), dimensions={} + alpha_side_input_scalar = f32[] constant(0.899994934) + alpha_side_input = f32[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={} + + input = s8[1,3,3,64] parameter(0) + filter = s8[3,3,64,64] parameter(1) + side_input = s8[1,3,3,64] parameter(2) + bias = f32[64] parameter(3) + + inputs32 = s32[1,3,3,64] convert(input) + filters32 = s32[3,3,64,64] convert(filter) + side_input_f32 = f32[1,3,3,64] convert(side_input) + + conv = s32[1,3,3,64] convolution(inputs32, filters32), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1 + + convfloat = f32[1,3,3,64] convert(conv) + scaled_conv = f32[1,3,3,64] multiply(convfloat, alpha_conv) + scaled_side_input = f32[1,3,3,64] multiply(side_input_f32, alpha_side_input) + broadcasted_bias = f32[1,3,3,64] broadcast(bias), dimensions={3} + add1 = f32[1,3,3,64] add(scaled_conv, broadcasted_bias) + add2 = f32[1,3,3,64] add(add1, scaled_side_input) + relu = f32[1,3,3,64] maximum(zeros, add2) + + lower = f32[] constant(-128) + lowers = f32[1,3,3,64] broadcast(lower), dimensions={} + upper = f32[] constant(127) + uppers = f32[1,3,3,64] broadcast(upper), dimensions={} + + clamp = f32[1,3,3,64] clamp(lowers, relu, uppers) + + ROOT convert = s8[1,3,3,64] convert(clamp) + })", + // post_hlo + R"( + ; CHECK-LABEL: ENTRY %Test (input: s8[1,3,3,64], filter: s8[3,3,64,64], side_input: s8[1,3,3,64], bias: f32[64]) -> s8[1,3,3,64] { + ; CHECK: %custom-call{{(\.[0-9])?}} = (s8[1,3,3,64]{3,2,1,0}, u8[{{[0-9]+}}]{0}) custom-call(%input, %copy{{(\.[0-9])?}}, %bias, %side_input), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config= + ; CHECK-NEXT: ROOT 
%get-tuple-element{{(\.[0-9])?}} = s8[1,3,3,64]{3,2,1,0} get-tuple-element(%custom-call{{(\.[0-9])?}}), index=0 + )"); +} + +TEST_F(CudnnFusedConvRewriterTest, + TestFusedConvWithScaledFloatSideInputBiasInt8ToInt8) { + // From: + // convert(clamp(max(0, alpha_conv * conv(x, w) + alpha_side * + // float_side_input + bias))); To: convert(clamp(conv(int8_x, int8_w, + // float_alpha_side, float_side_input, float_bias))); + TestClamp( + // pre_hlo + R"( + HloModule Test + + ENTRY Test { + zero = f32[] constant(0) + zeros = f32[1,3,3,64] broadcast(zero), dimensions={} + alpha_conv_scalar = f32[] constant(0.999994934) + alpha_conv = f32[1,3,3,64] broadcast(alpha_conv_scalar), dimensions={} + alpha_side_input_scalar = f32[] constant(0.899994934) + alpha_side_input = f32[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={} + + input = s8[1,3,3,64] parameter(0) + filter = s8[3,3,64,64] parameter(1) + side_input = f32[1,3,3,64] parameter(2) + bias = f32[64] parameter(3) + + inputs32 = s32[1,3,3,64] convert(input) + filters32 = s32[3,3,64,64] convert(filter) + + conv = s32[1,3,3,64] convolution(inputs32, filters32), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1 + + convfloat = f32[1,3,3,64] convert(conv) + scaled_conv = f32[1,3,3,64] multiply(convfloat, alpha_conv) + scaled_side_input = f32[1,3,3,64] multiply(side_input, alpha_side_input) + broadcasted_bias = f32[1,3,3,64] broadcast(bias), dimensions={3} + add1 = f32[1,3,3,64] add(scaled_conv, broadcasted_bias) + add2 = f32[1,3,3,64] add(add1, scaled_side_input) + relu = f32[1,3,3,64] maximum(zeros, add2) + + lower = f32[] constant(-128) + lowers = f32[1,3,3,64] broadcast(lower), dimensions={} + upper = f32[] constant(127) + uppers = f32[1,3,3,64] broadcast(upper), dimensions={} + + clamp = f32[1,3,3,64] clamp(lowers, relu, uppers) + + ROOT convert = s8[1,3,3,64] convert(clamp) + })", + // post_hlo + R"( + ; CHECK-LABEL: ENTRY %Test (input: s8[1,3,3,64], filter: s8[3,3,64,64], side_input: f32[1,3,3,64], bias: f32[64]) -> s8[1,3,3,64] { + ; CHECK: %custom-call{{(\.[0-9])?}} = (f32[1,3,3,64]{3,2,1,0}, u8[{{[0-9]+}}]{0}) custom-call(%input, %copy{{(\.[0-9])?}}, %bias, %side_input), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config= + ; CHECK: ROOT %fusion = s8[1,3,3,64]{3,2,1,0} fusion(%get-tuple-element{{(\.[0-9])?}}), kind=kLoop, calls=%fused_computation + )"); +} + +TEST_F(CudnnFusedConvRewriterTest, + TestFusedConvWithScaledInt8SideInputBiasInt8ToFloat) { + // From: + // clamp(max(0, alpha_conv * conv(x, w) + alpha_side * + // convert(int8_side_input) + bias)); To: clamp(conv(int8_x, int8_w, + // float_alpha_side, convert(int8_side_input), float_bias)); + TestClamp( + // pre_hlo + R"( + HloModule Test + + ENTRY Test { + zero = f32[] constant(0) + zeros = f32[1,3,3,64] broadcast(zero), dimensions={} + alpha_conv_scalar = f32[] constant(0.999994934) + alpha_conv = f32[1,3,3,64] broadcast(alpha_conv_scalar), dimensions={} + alpha_side_input_scalar = f32[] constant(0.899994934) + alpha_side_input = f32[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={} + + input = s8[1,3,3,64] parameter(0) + filter = s8[3,3,64,64] parameter(1) + side_input = s8[1,3,3,64] parameter(2) + bias = f32[64] parameter(3) + + inputs32 = s32[1,3,3,64] convert(input) + filters32 = s32[3,3,64,64] convert(filter) + side_input_f32 = f32[1,3,3,64] convert(side_input) + + conv = s32[1,3,3,64] convolution(inputs32, filters32), window={size=3x3 pad=1_1x1_1}, 
dim_labels=b01f_01io->b01f, feature_group_count=1 + + convfloat = f32[1,3,3,64] convert(conv) + scaled_conv = f32[1,3,3,64] multiply(convfloat, alpha_conv) + scaled_side_input = f32[1,3,3,64] multiply(side_input_f32, alpha_side_input) + broadcasted_bias = f32[1,3,3,64] broadcast(bias), dimensions={3} + add1 = f32[1,3,3,64] add(scaled_conv, broadcasted_bias) + add2 = f32[1,3,3,64] add(add1, scaled_side_input) + relu = f32[1,3,3,64] maximum(zeros, add2) + + lower = f32[] constant(-128) + lowers = f32[1,3,3,64] broadcast(lower), dimensions={} + upper = f32[] constant(127) + uppers = f32[1,3,3,64] broadcast(upper), dimensions={} + + ROOT clamp = f32[1,3,3,64] clamp(lowers, relu, uppers) + })", + // post_hlo + R"( + ; CHECK-LABEL: ENTRY %Test (input: s8[1,3,3,64], filter: s8[3,3,64,64], side_input: s8[1,3,3,64], bias: f32[64]) -> f32[1,3,3,64] { + ; CHECK: %side_input_f32 = f32[1,3,3,64]{3,2,1,0} convert(%side_input) + ; CHECK: %custom-call{{(\.[0-9])?}} = (f32[1,3,3,64]{3,2,1,0}, u8[{{[0-9]*}}]{0}) custom-call(%input, %copy{{(\.[0-9])?}}, %bias, %side_input_f32), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config= + ; CHECK: ROOT %fusion = f32[1,3,3,64]{3,2,1,0} fusion(%get-tuple-element{{(\.[0-9])?}}), kind=kLoop, calls=%fused_computation + )"); +} + +TEST_F(CudnnFusedConvRewriterTest, TestConvInt8ToInt8NoClamp) { + // Check that integer convolution without clamp to int8 is not allowed. + // convert(custom_call(int32_x, int32_w, + // cudnnConvolutionForward)) + const string module_str = absl::StrFormat(R"( + HloModule Test + + ENTRY Test (input: s8[1,17,9,9], filter: s8[3,3,17,32]) -> s8[1,32,9,9] { + zero = s8[] constant(0) + zeros = s8[1,32,9,9]{3,2,1,0} broadcast(s8[] zero), dimensions={} + input = s8[1,17,9,9]{3,2,1,0} parameter(0) + filter = s8[3,3,17,32]{3,2,1,0} parameter(1) + custom-call = (s32[1,32,9,9]{3,2,1,0}, u8[0]{0}) custom-call(s8[1,17,9,9]{3,2,1,0} input, s8[3,3,17,32]{3,2,1,0} filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, custom_call_target="__cudnn$convForward", backend_config="{\"convResultScale\":1}" + get-tuple-element = s32[1,32,9,9]{3,2,1,0} get-tuple-element((s32[1,32,9,9]{3,2,1,0}, u8[0]{0}) custom-call), index=0 + convert = s8[1,32,9,9]{3,2,1,0} convert(s32[1,32,9,9]{3,2,1,0} get-tuple-element) + ROOT relu = s8[1,32,9,9]{3,2,1,0} maximum(s8[1,32,9,9]{3,2,1,0} zeros, s8[1,32,9,9]{3,2,1,0} convert) + })"); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str)); + + ASSERT_FALSE(CudnnFusedConvRewriter().Run(m.get()).ok()); +} + +TEST_F(CudnnFusedConvRewriterTest, TestFusedConvInt8ToInt8NoClamp) { + // Although bias and so on are fused with forward convolution, + // it is still not allowed if the output is not clampped/converted to int8 + // max(0, alpha_conv * conv(x, w) + alpha_side * side_input + bias); for int8 + + const string module_str = absl::StrFormat(R"( + HloModule Test + + ENTRY Test (input: s8[1,17,9,9], filter: s8[3,3,17,32]) -> s8[1,32,9,9] { + zero = s8[] constant(0) + zeros = s8[1,32,9,9]{3,2,1,0} broadcast(s8[] zero), dimensions={} + input = s8[1,17,9,9]{3,2,1,0} parameter(0) + filter = s8[3,3,17,32]{3,2,1,0} parameter(1) + custom-call = (s32[1,32,9,9]{3,2,1,0}, u8[0]{0}) custom-call(s8[1,17,9,9]{3,2,1,0} input, s8[3,3,17,32]{3,2,1,0} filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, custom_call_target="__cudnn$convForward", backend_config="{\"convResultScale\":1}" + get-tuple-element = s32[1,32,9,9]{3,2,1,0} 
get-tuple-element((s32[1,32,9,9]{3,2,1,0}, u8[0]{0}) custom-call), index=0 + convert = s8[1,32,9,9]{3,2,1,0} convert(s32[1,32,9,9]{3,2,1,0} get-tuple-element) + ROOT relu = s8[1,32,9,9]{3,2,1,0} maximum(s8[1,32,9,9]{3,2,1,0} zeros, s8[1,32,9,9]{3,2,1,0} convert) + })"); + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str)); + + ASSERT_FALSE(CudnnFusedConvRewriter().Run(m.get()).ok()); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc similarity index 52% rename from tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc rename to tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc index 958e0b9c6e7..17c02b64db5 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" @@ -70,21 +70,26 @@ static HloInstruction* PadInstruction(HloInstruction* instr, HloInstruction::CreatePad(new_shape, instr, zero, pad_config)); } -// Modifies the given convolution to have the given LHS/RHS/result shapes. +// Modifies the given convolution to have the given input and result shapes. static Status PadConv(HloCustomCallInstruction* conv, - const Shape& new_lhs_shape, const Shape& new_rhs_shape, + absl::Span new_input_shapes, const Shape& new_result_shape) { CHECK_EQ(0, conv->shape().tuple_shapes(1).dimensions(0)) << "conv must use 0 scratch bytes, i.e. this pass must be run " "before CudnnConvAlgorithmPicker."; - - auto* lhs = conv->mutable_operand(0); - auto* rhs = conv->mutable_operand(1); - auto* new_lhs = PadInstruction(lhs, new_lhs_shape); - auto* new_rhs = PadInstruction(rhs, new_rhs_shape); + std::vector new_operands; + new_operands.reserve(conv->operand_count()); + for (int i = 0; i < conv->operand_count(); ++i) { + new_operands.push_back( + PadInstruction(conv->mutable_operand(i), new_input_shapes[i])); + } const Shape& result_shape = conv->shape().tuple_shapes(0); - CHECK(new_lhs != lhs || new_rhs != rhs) - << "We should have had to pad either LHS or RHS."; + + bool changed = false; + for (int i = 0; i < conv->operand_count(); ++i) { + changed |= (new_operands[i] != conv->mutable_operand(i)); + } + CHECK(changed) << "We should have had to pad at least one input operand."; auto add = [&](std::unique_ptr new_instr) { return conv->parent()->AddInstruction(std::move(new_instr)); @@ -93,10 +98,10 @@ static Status PadConv(HloCustomCallInstruction* conv, Shape new_conv_shape = ShapeUtil::MakeTupleShape( {new_result_shape, ShapeUtil::MakeShape(U8, {0})}); auto* new_conv = - add(conv->CloneWithNewOperands(new_conv_shape, {new_lhs, new_rhs})); + add(conv->CloneWithNewOperands(new_conv_shape, new_operands)); - // Slice the new conv result if necessary, keeping in mind that new_conv has - // tuple shape (new_result_shape, u8[0]). 
+ // Slice the new conv result if necessary, keeping in mind that new_conv + // has tuple shape (new_result_shape, u8[0]). if (!ShapeUtil::Equal(result_shape, new_result_shape)) { std::vector<int64> start_indices(result_shape.dimensions_size(), 0); std::vector<int64> end_indices(result_shape.dimensions().begin(), @@ -118,7 +123,61 @@ static Status PadConv(HloCustomCallInstruction* conv, return conv->parent()->ReplaceInstruction(conv, new_conv); } -static StatusOr<bool> PadForTensorCores(HloCustomCallInstruction* conv) { +static std::vector<HloCustomCallInstruction*> GetRelevantConvs( + HloComputation* comp) { + std::vector<HloCustomCallInstruction*> convs; + for (HloInstruction* instr : comp->instructions()) { + if (IsCustomCallToDnnConvolution(*instr)) { + convs.push_back(Cast<HloCustomCallInstruction>(instr)); + } + } + return convs; +} + +// This is the main function of the transform. It runs on a given custom-call +// node to a cuDNN convolution, calls resolve_pad_shapes to resolve the +// desired input/output feature map shapes, and adds the necessary padding and +// slicing nodes around them. +// +// resolve_pad_shapes points to a function. It takes conv, a custom-call +// instruction to a cuDNN convolution that may need padding, figures out the +// desired padded input and output tensor shapes, and stores them in +// new_input_shapes and new_result_shape. Note that new_input_shapes is a +// vector because there may be multiple input tensors. In addition to the +// status, the function returns true if padding is necessary and false +// otherwise. +static StatusOr<bool> ResolveAndPad( + HloCustomCallInstruction* conv, + StatusOr<bool> (*resolve_pad_shapes)(HloCustomCallInstruction* conv, + std::vector<Shape>* new_input_shapes, + Shape* new_result_shape)) { + std::vector<Shape> new_input_shapes; + Shape new_result_shape; + TF_ASSIGN_OR_RETURN(bool result, resolve_pad_shapes(conv, &new_input_shapes, + &new_result_shape)); + if (result) { + TF_RETURN_IF_ERROR(PadConv(conv, new_input_shapes, new_result_shape)); + return true; + } + return false; +} + +// Adds padding to cudnn convolutions to make them run faster on GPUs with +// tensor cores. +// +// - f16 convolutions are padded to have input/output channel dimensions that +// are multiples of 8, so that we can use tensor cores. +// +// - f16 convolutions with 3 input channels and 32 or 64 output channels are +// padded to 4 input channels. There's a special-cased cudnn algorithm just +// for this. +// +// Don't run this pass on GPUs without tensor cores -- it will make them slower! +// +// TODO(jlebar): Also pad dots. +static StatusOr<bool> TryResolvePadedShapesForTensorCore( + HloCustomCallInstruction* conv, std::vector<Shape>* new_input_shapes_ptr, + Shape* new_result_shape_ptr) { TF_ASSIGN_OR_RETURN(auto kind, GetCudnnConvKind(conv)); const auto& dnums = conv->convolution_dimension_numbers(); auto* lhs = conv->mutable_operand(0); @@ -138,7 +197,8 @@ static StatusOr<bool> PadForTensorCores(HloCustomCallInstruction* conv) { Shape new_lhs_shape = lhs->shape(); Shape new_rhs_shape = rhs->shape(); - Shape new_result_shape = conv->shape().tuple_shapes(0); + Shape& new_result_shape = *new_result_shape_ptr; + new_result_shape = conv->shape().tuple_shapes(0); // new_{input,filter_output}_shape points to the appropriate one of // new_{lhs,rhs,result}_shape. @@ -211,29 +271,135 @@ static StatusOr<bool> PadForTensorCores(HloCustomCallInstruction* conv) { return false; } - // OK, let's do the transformation! 
- TF_RETURN_IF_ERROR( - PadConv(conv, new_lhs_shape, new_rhs_shape, new_result_shape)); + new_input_shapes_ptr->push_back(new_lhs_shape); + new_input_shapes_ptr->push_back(new_rhs_shape); return true; } -static std::vector<HloCustomCallInstruction*> GetRelevantConvs( - HloComputation* comp) { - std::vector<HloCustomCallInstruction*> convs; - for (HloInstruction* instr : comp->instructions()) { - if (IsCustomCallToDnnConvolution(*instr)) { - convs.push_back(Cast<HloCustomCallInstruction>(instr)); - } + +// Adds padding to cudnn integer convolutions to make the input and output +// feature maps multiples of 4. +static StatusOr<bool> TryResolvePadedShapesForIntegerConvolution( + HloCustomCallInstruction* conv, std::vector<Shape>* new_input_shapes_ptr, + Shape* new_result_shape_ptr) { + TF_ASSIGN_OR_RETURN(auto kind, GetCudnnConvKind(conv)); + const Shape& input_shape = conv->operand(0)->shape(); + const Shape& result_shape = conv->shape().tuple_shapes(0); + + // Integer convolution only + if (!primitive_util::IsIntegralType(input_shape.element_type())) { + return false; } - return convs; + + // kForward and kForwardActivation only + if (kind != CudnnConvKind::kForward && + kind != CudnnConvKind::kForwardActivation) { + return false; + } + + const auto& dnums = conv->convolution_dimension_numbers(); + std::vector<Shape>& new_input_shapes = *new_input_shapes_ptr; + for (auto operand : conv->operands()) { + new_input_shapes.push_back(operand->shape()); + } + Shape& new_result_shape = *new_result_shape_ptr; + new_result_shape = conv->shape().tuple_shapes(0); + + // Pad the features to multiples of 4 and, for debugging purposes, check + // how much the conv buffer sizes change. + { + auto pad_dim = [](Shape* s, int64 dim) { + s->set_dimensions(dim, RoundUpToNearest(s->dimensions(dim), 4)); + }; + + switch (kind) { + case CudnnConvKind::kForward: + CHECK_EQ(new_input_shapes.size(), 2); + pad_dim(&new_input_shapes[0], + dnums.input_feature_dimension()); // Input feature maps + pad_dim(&new_input_shapes[1], + dnums.kernel_input_feature_dimension()); // Kernel for the + // input feature maps + pad_dim( + &new_input_shapes[1], + dnums.kernel_output_feature_dimension()); // Kernel for the output + // feature maps + pad_dim(&new_result_shape, + dnums.output_feature_dimension()); // Output feature maps + break; + case CudnnConvKind::kForwardActivation: + CHECK(new_input_shapes.size() == 3 || new_input_shapes.size() == 4); + pad_dim(&new_input_shapes[0], + dnums.input_feature_dimension()); // Input feature maps + pad_dim(&new_input_shapes[1], + dnums.kernel_input_feature_dimension()); // Kernel for the + // input feature maps + pad_dim( + &new_input_shapes[1], + dnums.kernel_output_feature_dimension()); // Kernel for the output + // feature maps + pad_dim(&new_input_shapes[2], 0); // Bias + if (new_input_shapes.size() == 4) { + pad_dim(&new_input_shapes[3], + dnums.output_feature_dimension()); // Optional side input + } + pad_dim(&new_result_shape, + dnums.output_feature_dimension()); // Output feature maps + break; + default: + CHECK(false); + } + // Check that padding wouldn't increase the total bytes read/written by this + // operation too much. 
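+ // Illustrative arithmetic (not from this change): padding an s8[10,20,30,41] + // operand to s8[10,20,30,44] grows ShapeUtil::ByteSizeOf from 246000 to + // 264000 bytes, a ~1.07x increase, which is what the check below compares + // against kMaxBytesTouchedIncrease.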
+ auto check_size_increase = [&](const Shape& old_shape, + const Shape& new_shape) { + int64 old_bytes = ShapeUtil::ByteSizeOf(old_shape); + int64 new_bytes = ShapeUtil::ByteSizeOf(new_shape); + if (new_bytes <= old_bytes * kMaxBytesTouchedIncrease) { + return; + } + VLOG(3) + << "Not padding convolution; doing so would change input / result " + "shape from " + << ShapeUtil::HumanString(old_shape) << " to " + << ShapeUtil::HumanString(new_shape) << ", a size increase of " + << new_bytes / static_cast(old_bytes) << "x > " + << kMaxBytesTouchedIncrease << "x: " << conv->ToString(); + }; + + for (int64 i = 0; i < conv->operand_count(); ++i) { + check_size_increase(conv->operand(i)->shape(), new_input_shapes[i]); + } + check_size_increase(result_shape, new_result_shape); + } + + bool changed = false; + for (int64 i = 0; i < conv->operand_count(); ++i) { + changed |= + !ShapeUtil::Equal(conv->operand(i)->shape(), new_input_shapes[i]); + } + if (!changed) { + VLOG(3) << "No need to pad features of " << conv->ToString(); + } + + return changed; } -StatusOr CudnnConvPadForTensorCores::Run(HloModule* module) { +StatusOr CudnnPadForConvolutions::Run(HloModule* module) { bool changed = false; for (HloComputation* comp : module->MakeNonfusionComputations()) { for (HloCustomCallInstruction* conv : GetRelevantConvs(comp)) { - TF_ASSIGN_OR_RETURN(bool result, PadForTensorCores(conv)); - changed |= result; + TF_ASSIGN_OR_RETURN( + bool local_changed, + ResolveAndPad(conv, TryResolvePadedShapesForIntegerConvolution)); + changed |= local_changed; + } + for (HloCustomCallInstruction* conv : GetRelevantConvs(comp)) { + if (is_volta_or_later_) { + TF_ASSIGN_OR_RETURN( + bool local_changed, + ResolveAndPad(conv, TryResolvePadedShapesForTensorCore)); + changed |= local_changed; + } } } return changed; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.h b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.h new file mode 100644 index 00000000000..b065c6e4bd4 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.h @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_PAD_FOR_CONVOLUTIONS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_PAD_FOR_CONVOLUTIONS_H_ + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" + +namespace xla { +namespace gpu { + +// Two zero-paddings for CuDNN thunking are done in this transform: padding for +// tensor cores and padding for integer convolutions. This transform also +// add slice instruction to remove unnecessary output features. 
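+// +// Minimal usage sketch (illustrative, not part of this change), assuming the +// caller already knows the device generation: +//   HloPassPipeline pipeline("cudnn-conv-padding"); +//   pipeline.AddPass<CudnnPadForConvolutions>(/*is_volta_or_later=*/true); +//   TF_ASSIGN_OR_RETURN(bool changed, pipeline.Run(module));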
+class CudnnPadForConvolutions : public HloModulePass { + public: + explicit CudnnPadForConvolutions(bool is_volta_or_later) + : is_volta_or_later_(is_volta_or_later) {} + absl::string_view name() const override { + return "cudnn_pad_for_convolutions"; + } + // Run PadForConvolutions on the given module and return if any change is made + StatusOr Run(HloModule* module) override; + + private: + const bool is_volta_or_later_; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_PAD_FOR_CONVOLUTIONS_H_ diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc similarity index 60% rename from tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc rename to tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc index af9303a5b76..3d0780aedd8 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -29,9 +29,9 @@ namespace { namespace op = xla::testing::opcode_matchers; using ::testing::_; -class CudnnConvPadForTensorCoresTest : public HloTestBase {}; +class CudnnPadForConvolutionsTest : public HloTestBase {}; -TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvInputChannels) { +TEST_F(CudnnPadForConvolutionsTest, PadF16ForwardConvInputChannels) { auto module = ParseAndReturnVerifiedModule(R"( HloModule TestModule @@ -43,7 +43,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvInputChannels) { custom_call_target="__cudnn$convForward" })") .ValueOrDie(); - EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); auto* root = module->entry_computation()->root_instruction(); SCOPED_TRACE(module->ToString()); @@ -56,7 +56,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvInputChannels) { ShapeUtil::MakeShape(F16, {2, 2, 48, 40}))); } -TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardInputConvOutputChannels) { +TEST_F(CudnnPadForConvolutionsTest, PadF16BackwardInputConvOutputChannels) { auto module = ParseAndReturnVerifiedModule(R"( HloModule TestModule @@ -68,7 +68,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardInputConvOutputChannels) { custom_call_target="__cudnn$convBackwardInput" })") .ValueOrDie(); - EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); auto* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::CustomCall(kCudnnConvBackwardInputCallTarget, op::Pad(op::Parameter(0), _), @@ -79,7 +79,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardInputConvOutputChannels) { ShapeUtil::MakeShape(F16, {2, 2, 40, 48}))); } -TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvOutputChannels) { +TEST_F(CudnnPadForConvolutionsTest, PadF16ForwardConvOutputChannels) { 
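  // The pass should pad the filter's output-feature dimension up to a multiple // of 8 and slice the conv result back to its original shape, which is what // the matchers below verify.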
auto module = ParseAndReturnVerifiedModule(R"( HloModule TestModule @@ -91,7 +91,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvOutputChannels) { custom_call_target="__cudnn$convForward" })") .ValueOrDie(); - EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); auto* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Tuple(op::Slice(op::GetTupleElement(op::CustomCall( kCudnnConvForwardCallTarget, op::Parameter(0), @@ -99,7 +99,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvOutputChannels) { _)); } -TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardInputConvInputChannels) { +TEST_F(CudnnPadForConvolutionsTest, PadF16BackwardInputConvInputChannels) { auto module = ParseAndReturnVerifiedModule(R"( HloModule TestModule @@ -112,7 +112,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardInputConvInputChannels) { ROOT gte = f16[10,20,30,41] get-tuple-element(result), index=0 })") .ValueOrDie(); - EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); auto* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::GetTupleElement(op::Tuple( op::Slice(op::GetTupleElement(op::CustomCall( @@ -121,7 +121,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardInputConvInputChannels) { _))); } -TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardFilterConvInputChannels) { +TEST_F(CudnnPadForConvolutionsTest, PadF16BackwardFilterConvInputChannels) { auto module = ParseAndReturnVerifiedModule(R"( HloModule TestModule @@ -134,7 +134,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardFilterConvInputChannels) { ROOT gte = f16[2,2,41,40] get-tuple-element(result), index=0 })") .ValueOrDie(); - EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); auto* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::GetTupleElement(op::Tuple( op::Slice(op::GetTupleElement(op::CustomCall( @@ -143,7 +143,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardFilterConvInputChannels) { _))); } -TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardFilterConvOutputChannels) { +TEST_F(CudnnPadForConvolutionsTest, PadF16BackwardFilterConvOutputChannels) { auto module = ParseAndReturnVerifiedModule(R"( HloModule TestModule @@ -156,7 +156,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardFilterConvOutputChannels) { ROOT gte = f16[2,2,40,41] get-tuple-element(result), index=0 })") .ValueOrDie(); - EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); auto* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::GetTupleElement(op::Tuple( op::Slice(op::GetTupleElement(op::CustomCall( @@ -165,7 +165,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardFilterConvOutputChannels) { _))); } -TEST_F(CudnnConvPadForTensorCoresTest, PadInputFeatures3To4) { +TEST_F(CudnnPadForConvolutionsTest, PadInputFeatures3To4) { auto module = ParseAndReturnVerifiedModule(R"( HloModule TestModule @@ -177,7 +177,7 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadInputFeatures3To4) { custom_call_target="__cudnn$convForward" })") .ValueOrDie(); - EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie()); + 
EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); auto* root = module->entry_computation()->root_instruction(); SCOPED_TRACE(module->ToString()); @@ -190,6 +190,78 @@ TEST_F(CudnnConvPadForTensorCoresTest, PadInputFeatures3To4) { ShapeUtil::MakeShape(F16, {2, 2, 4, 32}))); } +TEST_F(CudnnPadForConvolutionsTest, PadIntForwardConvInputChannels) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + input = s8[10,20,30,41] parameter(0) + filter = s8[2,2,41,40] parameter(1) + ROOT result = (f32[10,20,30,40], u8[0]) custom-call(input, filter), + window={size=2x2}, dim_labels=b01f_01io->b01f, + custom_call_target="__cudnn$convForward" + })") + .ValueOrDie(); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); + auto* root = module->entry_computation()->root_instruction(); + + SCOPED_TRACE(module->ToString()); + EXPECT_THAT(root, op::CustomCall(kCudnnConvForwardCallTarget, + op::Pad(op::Parameter(0), _), + op::Pad(op::Parameter(1), _))); + EXPECT_TRUE(ShapeUtil::Equal(root->operand(0)->shape(), + + ShapeUtil::MakeShape(S8, {10, 20, 30, 44}))); + EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->shape(), + ShapeUtil::MakeShape(S8, {2, 2, 44, 40}))); +} + +TEST_F(CudnnPadForConvolutionsTest, PadIntForwardConvOutputChannels) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule TestModule + + ENTRY TestComputation { + input = s8[10,20,30,40] parameter(0) + filter = s8[2,2,40,41] parameter(1) + ROOT result = (f32[10,20,30,41], u8[0]) custom-call(input, filter), + window={size=2x2}, dim_labels=b01f_01io->b01f, + custom_call_target="__cudnn$convForward" + })") + .ValueOrDie(); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); + auto* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::Slice(op::GetTupleElement(op::CustomCall( + kCudnnConvForwardCallTarget, op::Parameter(0), + op::Pad(op::Parameter(1), _)))), + _)); +} + +TEST_F(CudnnPadForConvolutionsTest, + PadIntFusedForwardConvInputAndOutputChannels) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule Test + + ENTRY %Test (input: s8[1,3,3,2], filter: s8[3,3,2,5], side_input: s8[1,3,3,5], bias: s8[5]) -> f32[1,3,3,5] { + %input = s8[1,3,3,2]{3,2,1,0} parameter(0) + %filter = s8[3,3,2,5]{3,2,1,0} parameter(1) + %bias = s8[5]{0} parameter(3) + %convert = f32[5]{0} convert(s8[5]{0} %bias) + %side_input = f32[1,3,3,5]{3,2,1,0} parameter(2) + %custom-call.1 = (f32[1,3,3,5]{3,2,1,0}, u8[0]{0}) custom-call(s8[1,3,3,2]{3,2,1,0} %input, s8[3,3,2,5]{3,2,1,0} %filter, f32[5]{0} %convert, f32[1,3,3,5]{3,2,1,0} %side_input), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config="{\"activationMode\":\"2\",\"convResultScale\":1,\"sideInputScale\":1}" + ROOT %get-tuple-element.1 = f32[1,3,3,5]{3,2,1,0} get-tuple-element((f32[1,3,3,5]{3,2,1,0}, u8[0]{0}) %custom-call.1), index=0 + })") + .ValueOrDie(); + EXPECT_TRUE(CudnnPadForConvolutions(true).Run(module.get()).ValueOrDie()); + auto* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, op::GetTupleElement(op::Tuple( + op::Slice(op::GetTupleElement(op::CustomCall( + kCudnnConvBiasActivationForwardCallTarget, + op::Pad(op::Parameter(0), _), op::Pad(op::Parameter(1), _), + op::Pad(op::Convert(op::Parameter(3)), _), + op::Pad(op::Parameter(2), _)))), + _))); +} } // anonymous namespace } // namespace gpu } // namespace xla diff --git 
a/tensorflow/compiler/xla/service/gpu/cusolver_context.cc b/tensorflow/compiler/xla/service/gpu/cusolver_context.cc index 4103a720c98..b18170b00e4 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_context.cc +++ b/tensorflow/compiler/xla/service/gpu/cusolver_context.cc @@ -169,7 +169,8 @@ StatusOr CusolverContext::PotrfBufferSize(PrimitiveType type, } #define POTRF_INSTANCE(T, type_prefix) \ - Status CusolverContext::Potrf( \ + template <> \ + Status CusolverContext::Potrf( \ se::blas::UpperLower uplo, int n, se::DeviceMemory A, int lda, \ se::DeviceMemory lapack_info, se::DeviceMemory workspace) { \ return CusolverStatusToStatus(DN_SOLVER_FN(potrf, type_prefix)( \ diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_context.h b/tensorflow/compiler/xla/service/gpu/cusolver_context.h index c3d075c47c7..dfe55188b18 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_context.h +++ b/tensorflow/compiler/xla/service/gpu/cusolver_context.h @@ -18,8 +18,10 @@ limitations under the License. #include -#include "third_party/gpus/cuda/include/cublas_v2.h" +#if !TENSORFLOW_USE_ROCM #include "third_party/gpus/cuda/include/cusolverDn.h" +#endif + #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" @@ -30,6 +32,8 @@ limitations under the License. namespace xla { namespace gpu { +#if !TENSORFLOW_USE_ROCM + class CusolverContext { public: // stream may be nullptr, in which case the context can only be used for @@ -43,26 +47,17 @@ class CusolverContext { CusolverContext& operator=(const CusolverContext&) = delete; CusolverContext& operator=(CusolverContext&&); - se::Stream* stream() const { return stream_; } - cusolverDnHandle_t handle() const { return handle_; } - // Computes the Cholesky factorization A = L * L^T for a single matrix. // Returns Status::OK() if the kernel was launched successfully. See: // http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-potrf - Status Potrf(se::blas::UpperLower uplo, int n, se::DeviceMemory dev_A, + template ::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value>> + Status Potrf(se::blas::UpperLower uplo, int n, se::DeviceMemory dev_A, int lda, se::DeviceMemory dev_lapack_info, - se::DeviceMemory workspace); - Status Potrf(se::blas::UpperLower uplo, int n, se::DeviceMemory dev_A, - int lda, se::DeviceMemory dev_lapack_info, - se::DeviceMemory workspace); - Status Potrf(se::blas::UpperLower uplo, int n, - se::DeviceMemory> dev_A, int lda, - se::DeviceMemory dev_lapack_info, - se::DeviceMemory> workspace); - Status Potrf(se::blas::UpperLower uplo, int n, - se::DeviceMemory> dev_A, int lda, - se::DeviceMemory dev_lapack_info, - se::DeviceMemory> workspace); + se::DeviceMemory workspace); // Returns the size of the `workspace` required by Potrf, in number of // elements of `type`. @@ -72,10 +67,42 @@ class CusolverContext { private: CusolverContext(se::Stream* stream, cusolverDnHandle_t handle); + cusolverDnHandle_t handle() const { return handle_; } + se::Stream* stream_ = nullptr; cusolverDnHandle_t handle_ = nullptr; }; +#else + +typedef void* cusolverDnHandle_t; + +// TODO(cheshire): Remove this hack once we have ROCM implementation. 
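+// Call sites keep using the templated interface unchanged, e.g. (illustrative) +//   context.Potrf(se::blas::UpperLower::kLower, n, dev_a, lda, dev_info, ws); +// with T deduced from the se::DeviceMemory<T> arguments; on ROCm these stubs +// simply LOG(FATAL).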
+class CusolverContext { + public: + static StatusOr Create(se::Stream* stream) { + LOG(FATAL) << "Unimplemented"; + } + + template ::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value>> + Status Potrf(se::blas::UpperLower uplo, int n, se::DeviceMemory dev_A, + int lda, se::DeviceMemory dev_lapack_info, + se::DeviceMemory workspace) { + LOG(FATAL) << "Unimplemented"; + } + + StatusOr PotrfBufferSize(PrimitiveType type, se::blas::UpperLower uplo, + int n, int lda) { + LOG(FATAL) << "Unimplemented"; + } +}; + +#endif + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc index 65673106391..85571804315 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h" #include "absl/strings/str_format.h" -#include "tensorflow/stream_executor/cuda/cuda_stream.h" #include "tensorflow/stream_executor/gpu/gpu_stream.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc index 98d8d00b62c..21a0ffab5e0 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc @@ -30,9 +30,9 @@ limitations under the License. #include "tensorflow/core/protobuf/autotuning.pb.h" #include "tensorflow/core/util/proto/proto_utils.h" #include "tensorflow/stream_executor/blas.h" -#include "tensorflow/stream_executor/cuda/redzone_allocator.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/gpu/redzone_allocator.h" namespace xla { namespace gpu { @@ -59,8 +59,8 @@ static StatusOr> DoUncachedGemmAutotune( const HloInstruction* gemm, se::DeviceMemoryBase lhs_buffer, se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, se::DeviceMemoryBase reference_result_buffer, se::Stream* stream, - const se::cuda::RedzoneAllocator& allocator, - const BufferComparator& comparator, bool crash_on_checking_failure) { + const se::RedzoneAllocator& allocator, const BufferComparator& comparator, + bool crash_on_checking_failure) { if (!stream->parent()->SynchronizeAllActivity()) { return InternalError("Failed to synchronize GPU for autotuning."); } @@ -81,8 +81,8 @@ static StatusOr> DoUncachedGemmAutotune( // the bias parameter. 
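 // (Recall the GEMM contract: output = alpha * lhs * rhs + beta * output, so // when beta != 0 the existing contents of the output buffer feed into the // result and must be initialized before profiling.)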
if (backend_config.beta() != 0) { int64 rng_state = 0; - InitializeFloatBuffer(stream, gemm->shape().element_type(), &rng_state, - output_buffer); + InitializeBuffer(stream, gemm->shape().element_type(), &rng_state, + output_buffer); } se::blas::ProfileResult profile_result; @@ -113,7 +113,7 @@ static StatusOr> DoUncachedGemmAutotune( absl::Milliseconds(profile_result.elapsed_time_in_ms())); TF_ASSIGN_OR_RETURN( - se::cuda::RedzoneAllocator::RedzoneCheckStatus rz_check_status, + se::RedzoneAllocator::RedzoneCheckStatus rz_check_status, allocator.CheckRedzones()); if (!rz_check_status.ok()) { result.mutable_failure()->set_kind(AutotuneResult::REDZONE_MODIFIED); @@ -188,7 +188,7 @@ static StatusOr> DoGemmAutotune( const HloInstruction* rhs, se::DeviceMemoryBase lhs_buffer, se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, se::DeviceMemoryBase reference_result_buffer, se::Stream* stream, - bool crash_on_checking_failure, const se::cuda::RedzoneAllocator& allocator, + bool crash_on_checking_failure, const se::RedzoneAllocator& allocator, const BufferComparator& comparator) { // Don't run autotuning concurrently on the same GPU. tensorflow::mutex_lock gpu_lock = LockGpu(stream->parent()); @@ -253,7 +253,7 @@ static StatusOr RunOnInstruction(HloInstruction* instr, }(); const HloModuleConfig& hlo_module_config = instr->GetModule()->config(); - se::cuda::RedzoneAllocator input_output_allocator( + se::RedzoneAllocator input_output_allocator( stream, allocator, PtxOptsFromConfig(hlo_module_config)); BufferComparator comparator(instr->shape(), hlo_module_config); @@ -264,8 +264,7 @@ static StatusOr RunOnInstruction(HloInstruction* instr, TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase buffer, input_output_allocator.AllocateBytes( ShapeUtil::ByteSizeOf(op->shape()))); - InitializeFloatBuffer(stream, op->shape().element_type(), &rng_state, - buffer); + InitializeBuffer(stream, op->shape().element_type(), &rng_state, buffer); return buffer; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 40ccf7a820b..95e21a84f29 100755 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -231,6 +231,9 @@ Status GpuCompiler::OptimizeHloModule( // run, meaning, the pipeline that contains layout assignment cannot contain // a layout-sensitive verifier! HloPassPipeline pipeline("layout assignment"); + // Layout assignment uses alias analysis, which requires the call graph to + // be flattened. + pipeline.AddPass(); pipeline.AddPass( hlo_module->mutable_entry_computation_layout(), LayoutAssignment::InstructionCanChangeLayout, stream_exec); @@ -306,7 +309,6 @@ Status GpuCompiler::PrepareHloModuleForIrEmitting(HloModule* hlo_module) { // (and sometime after) copy insertion, to avoid dead code from interfering // with the rewrites. pipeline.AddPass(); - pipeline.AddPass(); if (hlo_module->config().alias_passthrough_params()) { pipeline.AddPass(); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 97fa275a2e7..3d307cc8993 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -162,8 +162,7 @@ bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, return false; } // The elementwise output shapes must be the same (including layout). - // TODO(tjoerg): Further relax the constraint. The datatype does not matter. 
- return ShapeUtil::EqualIgnoringFpPrecision(get_loop_shape(instr_1), + return ShapeUtil::EqualIgnoringElementType(get_loop_shape(instr_1), get_loop_shape(instr_2)); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc index dc4e54c74d2..ae31b10deb3 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc @@ -465,7 +465,43 @@ TEST_F(GpuFusibleTest, const HloInstruction* fusion_1 = module->entry_computation()->root_instruction()->operand(0)->operand(0); const HloInstruction* fusion_2 = - module->entry_computation()->root_instruction()->operand(1)->operand(0); + module->entry_computation()->root_instruction()->operand(2); + EXPECT_NE(fusion_1, fusion_2); + EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); +} + +TEST_F(GpuFusibleTest, + ShapesCompatibleForMultiOutputFusion_DifferentElementType) { + auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1) + exp = f32[8,1,5,16,1,1]{5,4,3,2,1,0} exponential(p0.1) + ROOT tuple = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(mul, exp) + } + + fused_computation_2 { + p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + const.2 = f32[] constant(0) + broadcast = f32[8,1,5,16,1,1]{5,4,3,2,1,0} broadcast(const.2), dimensions={} + add = f32[8,1,5,16,1,1]{5,4,3,2,1,0} add(p0.2, broadcast) + ROOT convert = s32[8,1,5,16,1,1]{5,4,3,2,1,0} convert(add) + } + + ENTRY entry { + p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + fusion.1 = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_computation_1 + fusion.2 = s32[8,1,5,16,1,1]{5,4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2 + gte0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=0 + gte1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=1 + ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(gte0, gte1, fusion.2) + })")) + .ValueOrDie(); + const HloInstruction* fusion_1 = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + const HloInstruction* fusion_2 = + module->entry_computation()->root_instruction()->operand(2); + EXPECT_NE(fusion_1, fusion_2); EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2)); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc index 550f4662b55..75c9d93c63b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc @@ -58,6 +58,12 @@ HeuristicLayoutAssignment(const HloInstruction* instr, std::make_tuple(DataLayout::kBatchYXDepth, FilterLayout::kOutputYXInput, DataLayout::kBatchYXDepth); + // Integer convolution must use NHWC. 
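+ // (kAllNHWC, built above, is the NHWC triple: DataLayout::kBatchYXDepth for + // the input/output and FilterLayout::kOutputYXInput for the filter.)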
+ if (primitive_util::IsIntegralType( + instr->operand(0)->shape().element_type())) { + return kAllNHWC; + } + const DebugOptions& debug_options = instr->GetModule()->config().debug_options(); diff --git a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc index e9a6e8d14d9..bb85c509d18 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist.cc @@ -26,20 +26,7 @@ namespace gpu { // MSVC requires the extra const. Without, it reports an // "error C2131: expression did not evaluate to a constant". constexpr const absl::string_view kDefaultBlacklist = R"pb( - entries { - hlo: "(f16[256,112,112,64]{3,2,1,0}, u8[0]{0}) custom-call(f16[256,224,224,4]{3,2,1,0}, f16[7,7,4,64]{2,1,0,3}), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f, custom_call_target=\"__cudnn$convForward\", backend_config=\"{conv_result_scale:1}\"" - cc { major: 7 } - cudnn_version { major: 7 minor: 6 patch: 2 } - blas_version: "10201" - algos { id: 1 tensor_ops: true } - } - entries { - hlo: "(f16[7,7,4,64]{2,1,0,3}, u8[0]{0}) custom-call(f16[256,224,224,4]{3,2,1,0}, f16[256,112,112,64]{3,2,1,0}), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f, custom_call_target=\"__cudnn$convBackwardFilter\", backend_config=\"{conv_result_scale:1}\"" - cc { major: 7 } - cudnn_version { major: 7 minor: 6 patch: 2 } - blas_version: "10201" - algos { id: 1 tensor_ops: true } - })pb"; +)pb"; absl::Span GetBlacklistedConvAlgorithms(tensorflow::ComputeCapability cc, diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 1c617a07372..ea238a6db02 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1857,28 +1857,42 @@ Status IrEmitterUnnested::EmitTargetElementLoop( namespace { std::tuple GetStartOffsetAndStepForX( - int64 tile_size_x, int64 num_threads_x, - const KernelMappingScheme& mapping_scheme, llvm::IRBuilder<>* builder, - llvm::Value* x, llvm::Type* index_ty) { + const KernelMappingScheme& mapping_scheme, llvm::IRBuilder<>* b, + llvm::Value* x, const IrEmitterUnnested::ConstantGenerator& constant) { llvm::Value* start_offset_x; int64 step_x; if (mapping_scheme.DilatedX()) { start_offset_x = x; - step_x = num_threads_x; + step_x = mapping_scheme.GetNumberOfThreadsForDimensionX(); } else { - start_offset_x = builder->CreateMul( - x, llvm::ConstantInt::get(index_ty, tile_size_x / num_threads_x)); + start_offset_x = b->CreateMul( + x, constant(mapping_scheme.GetTileSizeForDimensionX() / + mapping_scheme.GetNumberOfThreadsForDimensionX())); step_x = 1; } return std::make_tuple(start_offset_x, step_x); } // Emits code for writing into a tile which fits fully into the output buffer. 
+// +// Pseudocode: +// +// for (y_idx = 0; y_idx < tile_size_y; y_idx += num_threads_y) { +// for (j = 0; j < tile_size_x / num_threads_x; j++) { +// y_pos = y + y_idx; +// if (dilated) +// x_pos = x + j * num_threads_x +// else +// x_pos = x * (tile_size_x / num_threads_x) + j +// +// EmitElementary(y_pos, x_pos); +// } +// } void EmitFullElementalTile( const KernelMappingScheme& mapping_scheme, const IrArray::Index& tile_origin_index, const string& loop_name, - KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, - llvm::Value* x, llvm::Type* index_ty, + KernelSupportLibrary* ksl, llvm::IRBuilder<>* b, llvm::Value* y, + llvm::Value* x, const IrEmitterUnnested::ConstantGenerator& constant, const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 num_threads_x = mapping_scheme.GetNumberOfThreadsForDimensionX(); int64 num_threads_y = mapping_scheme.GetNumberOfThreadsForDimensionY(); @@ -1887,26 +1901,23 @@ void EmitFullElementalTile( llvm::Value* start_offset_x; int64 step_x; - std::tie(start_offset_x, step_x) = GetStartOffsetAndStepForX( - tile_size_x, num_threads_x, mapping_scheme, builder, x, index_ty); + std::tie(start_offset_x, step_x) = + GetStartOffsetAndStepForX(mapping_scheme, b, x, constant); IrArray::Index source_idx = - tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, builder) - .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, builder); - ksl->For(loop_name + "_y", /*start=*/llvm::ConstantInt::get(index_ty, 0), - /*end=*/llvm::ConstantInt::get(index_ty, tile_size_y), - /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y), - [&](llvm::Value* y_indvar) { + tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, b) + .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, b); + ksl->For(loop_name + "_y", /*start=*/constant(0), + /*end=*/constant(tile_size_y), + /*step=*/constant(num_threads_y), [&](llvm::Value* y_indvar) { IrArray::Index source_idx_y = source_idx.AddOffsetToDim( - y_indvar, KernelMappingScheme::DimY, builder); - llvm::Value* y_loc = builder->CreateAdd(y_indvar, y); + y_indvar, KernelMappingScheme::DimY, b); + llvm::Value* y_loc = b->CreateAdd(y_indvar, y); for (int64 j = 0; j < tile_size_x / num_threads_x; j++) { IrArray::Index source_idx_y_x = source_idx_y.AddOffsetToDim( - llvm::ConstantInt::get(index_ty, j * step_x), - KernelMappingScheme::DimX, builder); - llvm::Value* x_loc = builder->CreateAdd( - llvm::ConstantInt::get(index_ty, j * step_x), - start_offset_x); + constant(j * step_x), KernelMappingScheme::DimX, b); + llvm::Value* x_loc = + b->CreateAdd(constant(j * step_x), start_offset_x); emit_elem_function(source_idx_y_x, y_loc, x_loc, j); } }); @@ -1914,12 +1925,30 @@ void EmitFullElementalTile( // Emits code for writing into a tile which does not fit fully into the output // buffer. 
+// +// Pseudocode: +// +// for (j = 0; j < tile_size_x / num_threads_x; j++) { +// if (dilated) +// x_pos = x + j * num_threads_x +// else +// x_pos = x * (tile_size_x / num_threads_x) + j +// +// if (x_pos < tile_width) { +// for (y_indvar = 0; y_indvar < tile_height_bound; y_indvar += +// num_threads_y) { +// if (y_indvar < tile_height) { +// EmitElementary(y + y_indevar, x); +// } +// } +// } +// } void EmitPartialElementalTile( const KernelMappingScheme& mapping_scheme, const IrArray::Index& tile_origin_index, const string& loop_name, - KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, + KernelSupportLibrary* ksl, llvm::IRBuilder<>* b, llvm::Value* y, llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width, - llvm::Type* index_ty, + const IrEmitterUnnested::ConstantGenerator& constant, const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 num_threads_x = mapping_scheme.GetNumberOfThreadsForDimensionX(); int64 num_threads_y = mapping_scheme.GetNumberOfThreadsForDimensionY(); @@ -1927,45 +1956,36 @@ void EmitPartialElementalTile( llvm::Value* start_offset_x; int64 step_x; - std::tie(start_offset_x, step_x) = GetStartOffsetAndStepForX( - tile_size_x, num_threads_x, mapping_scheme, builder, x, index_ty); + std::tie(start_offset_x, step_x) = + GetStartOffsetAndStepForX(mapping_scheme, b, x, constant); IrArray::Index source_idx = - tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, builder) - .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, builder); - for (int64 j = 0; j < tile_size_x / num_threads_x; j++) { - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j * step_x), - KernelMappingScheme::DimX, builder); - llvm::Value* x_loc = builder->CreateAdd( - llvm::ConstantInt::get(index_ty, j * step_x), start_offset_x); + tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, b) + .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, b); - ksl->If( - loop_name + "_x_in_tile", builder->CreateICmpULT(x_loc, tile_width), - [&] { - // tile_height_bound = - // ceil(tile_height / num_threads_y) * num_threads_y - llvm::Value* ceiling_of_ratio = builder->CreateUDiv( - builder->CreateAdd(tile_height, llvm::ConstantInt::get( - index_ty, num_threads_y - 1)), - llvm::ConstantInt::get(index_ty, num_threads_y)); - llvm::Value* tile_height_bound = builder->CreateMul( - ceiling_of_ratio, - llvm::ConstantInt::get(index_ty, num_threads_y)); - ksl->For( - loop_name, /*start=*/llvm::ConstantInt::get(index_ty, 0), - /*end=*/tile_height_bound, - /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y), - [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = builder->CreateAdd(y_indvar, y); - ksl->If(loop_name + "_y_in_tile", - builder->CreateICmpULT(y_loc, tile_height), [&] { - emit_elem_function( - source_idx_x.AddOffsetToDim( - y_indvar, KernelMappingScheme::DimY, builder), - y_loc, x_loc, j); - }); - }); - }); + for (int64 j = 0; j < tile_size_x / num_threads_x; j++) { + IrArray::Index source_idx_x = source_idx.AddOffsetToDim( + constant(j * step_x), KernelMappingScheme::DimX, b); + llvm::Value* x_loc = b->CreateAdd(constant(j * step_x), start_offset_x); + ksl->If(loop_name + "_x_in_tile", b->CreateICmpULT(x_loc, tile_width), [&] { + llvm::Value* ceiling_of_ratio = + b->CreateUDiv(b->CreateAdd(tile_height, constant(num_threads_y - 1)), + constant(num_threads_y)); + llvm::Value* tile_height_bound = + b->CreateMul(ceiling_of_ratio, constant(num_threads_y)); + ksl->For(loop_name, + 
/*start=*/constant(0), + /*end=*/tile_height_bound, + /*step=*/constant(num_threads_y), [&](llvm::Value* y_indvar) { + llvm::Value* y_loc = b->CreateAdd(y_indvar, y); + ksl->If(loop_name + "_y_in_tile", + b->CreateICmpULT(y_loc, tile_height), [&] { + emit_elem_function( + source_idx_x.AddOffsetToDim( + y_indvar, KernelMappingScheme::DimY, b), + y_loc, x_loc, j); + }); + }); + }); } } @@ -1977,31 +1997,39 @@ void EmitPartialElementalTile( // about tile_size_x/y and num_threads_x/y are stored in `mapping_scheme`. Emits // bounds check to ensure that each processed element is within the boundary // defined by `tile_width` and `tile_height`. +// +// Pseudocode: +// +// if (tile_size_x == tile_width && tile_size_y == tile_height) { +// EmitFullElementalTile(); +// } else { +// EmitPartialElementalTile(); +// } void EmitTiledElementalCodeWithBoundsCheck( const KernelMappingScheme& mapping_scheme, const IrArray::Index& tile_origin_index, const string& loop_name, - KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, + KernelSupportLibrary* ksl, llvm::IRBuilder<>* b, llvm::Value* y, llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width, const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 tile_size_x = mapping_scheme.GetTileSizeForDimensionX(); int64 tile_size_y = mapping_scheme.GetTileSizeForDimensionY(); llvm::Type* index_ty = tile_width->getType(); + auto constant = [&](int64 val) { + return llvm::ConstantInt::get(index_ty, val); + }; ksl->If( loop_name + "_full_tile", - builder->CreateAnd( - builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_x), - tile_width), - builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_y), - tile_height)), + b->CreateAnd(b->CreateICmpEQ(constant(tile_size_x), tile_width), + b->CreateICmpEQ(constant(tile_size_y), tile_height)), [&] { EmitFullElementalTile(mapping_scheme, tile_origin_index, loop_name, ksl, - builder, y, x, index_ty, emit_elem_function); + b, y, x, constant, emit_elem_function); }, [&] { EmitPartialElementalTile(mapping_scheme, tile_origin_index, loop_name, - ksl, builder, y, x, tile_height, tile_width, - index_ty, emit_elem_function); + ksl, b, y, x, tile_height, tile_width, + constant, emit_elem_function); }); } } // namespace @@ -2329,9 +2357,10 @@ void IrEmitterUnnested::EmitTileElementForReduction( : unnested_hlo; // Record the untransposed output linear address for the reduction. int partial_result_index = reduction_info.IsRowReduction() ? 0 : x_iter_num; - Store(GetUntransposedOutputLinearAddress(&b_, index, reduction_info), - InBoundsGEP(reduction_info.GetCurrentOutputLinearIndexAddress(), - {b_.getInt32(partial_result_index)})); + b_.CreateStore( + GetUntransposedOutputLinearAddress(&b_, index, reduction_info), + InBoundsGEP(reduction_info.GetCurrentOutputLinearIndexAddress(), + {b_.getInt32(partial_result_index)})); if (!reduction_info.IsRowReduction()) { llvm::Type* bool_ty = b_.getInt1Ty(); @@ -2381,23 +2410,23 @@ void IrEmitterUnnested::EmitTileElementForReduction( int num_partial_results = GetNumberOfPartialResults(reduction_info); auto index_without_linear = IrArray::Index( input_index.multidim(), reduction_operand_shape, input_index.GetType()); - absl::Span partial_reduction_result_addresses = - reduction_info.GetPartialResultAddresses(); - absl::Span reduction_input_addresses = - reduction_info.GetReductionInputAddresses(); + // Emit code to generate the input and perform the reduction computation for // each reduction instruction. 
for (int i = 0; i != reducers.size(); ++i) { + llvm::AllocaInst* input_address = + reduction_info.GetReductionInputAddresses()[i]; + llvm::AllocaInst* partial_reduction_result_address = + reduction_info.GetPartialResultAddresses()[i]; llvm::Value* const input_ir_value = input_gens[i](num_partial_results > 1 ? index_without_linear : input_index) .ValueOrDie(); - Store(input_ir_value, reduction_input_addresses[i]); - llvm::Value* partial_result_address = - InBoundsGEP(partial_reduction_result_addresses[i], - {b_.getInt32(partial_result_index)}); + Store(input_ir_value, input_address); + llvm::Value* partial_result_address = InBoundsGEP( + partial_reduction_result_address, {b_.getInt32(partial_result_index)}); TF_CHECK_OK(EmitCallToNestedComputation( - *reducers[i], {partial_result_address, reduction_input_addresses[i]}, + *reducers[i], {partial_result_address, input_address}, partial_result_address)); } @@ -2408,46 +2437,6 @@ void IrEmitterUnnested::EmitTileElementForReduction( /*use_linear_index=*/num_partial_results == 1, extra_output_gens)); } -// Emits tiles for a given dimension. -static void EmitTilesForBlockDim( - const KernelMappingScheme& mapping_scheme, KernelSupportLibrary* ksl, - llvm::Type* index_ty, const string& loop_name, - const IrArray::Index& starting_tile, int dim_id, llvm::IRBuilder<>* b_, - const std::function - emit_next_block_dim) { - absl::Span dims_in_tile = mapping_scheme.GetDimensionsInTiles(); - absl::Span dims_in_block = - mapping_scheme.GetDimensionsInBlocks(); - absl::Span block_sizes = mapping_scheme.GetBlockSizes(); - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { - return llvm::ConstantInt::get(index_ty, c); - }; - if (block_sizes[dim_id] == 1) { - emit_next_block_dim(starting_tile); - } else { - llvm::Value* starting_tile_index_for_dim = starting_tile[dim_id]; - llvm::Value* block_size_for_dim = index_typed_constant(block_sizes[dim_id]); - llvm::Value* block_id_for_dim = - b_->CreateUDiv(starting_tile_index_for_dim, block_size_for_dim); - llvm::Value* last_block_for_dim = - index_typed_constant(dims_in_block[dim_id] - 1); - llvm::Value* last_block_size_for_dim = - index_typed_constant(dims_in_tile[dim_id] - - (dims_in_block[dim_id] - 1) * block_sizes[dim_id]); - llvm::Value* num_tiles_in_block = - b_->CreateSelect(b_->CreateICmpEQ(last_block_for_dim, block_id_for_dim), - last_block_size_for_dim, block_size_for_dim); - ksl->For(loop_name, - /*start=*/index_typed_constant(0), - /*end=*/num_tiles_in_block, - /*step=*/1, [&](llvm::Value* block_dim_induction_var) { - IrArray::Index tile_index = starting_tile.AddOffsetToDim( - block_dim_induction_var, dim_id, b_); - emit_next_block_dim(tile_index); - }); - } -} - // Returns the index for the first element in the tile with the given tile // index. static IrArray::Index GetElementIndexForTileOrigin( @@ -2467,79 +2456,20 @@ static IrArray::Index GetElementIndexForTileOrigin( tile_index.GetType()); } -// Emits the tile with a given tile_index, by calculating the tight bounds for -// each dimension of the tile and then calling tile_generator. 
-static void EmitOneTileForTileIndex( - const IrArray::Index& tile_index, llvm::Type* index_ty, llvm::Value* y, - llvm::Value* x, const KernelMappingScheme& mapping_scheme, - KernelSupportLibrary* ksl, llvm::IRBuilder<>* b_, - IrEmitterUnnested::TileElementGenerator tile_generator) { +llvm::Value* IrEmitterUnnested::EmitTilingKernel( + const KernelMappingScheme& mapping_scheme, llvm::Type* index_ty, + TileElementGenerator tile_element_generator) { absl::Span dims_in_tile = mapping_scheme.GetDimensionsInTiles(); + absl::Span dims_in_block = + mapping_scheme.GetDimensionsInBlocks(); absl::Span dimensions_in_elements = mapping_scheme.GetDimensionsInElements(); - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + + auto constant = [&](uint64 c) -> llvm::Constant* { return llvm::ConstantInt::get(index_ty, c); }; - std::vector output_tile_bounds(3); - for (int i = KernelMappingScheme::DimY; i < KernelMappingScheme::DimTot; - ++i) { - int64 tile_size_for_dim = mapping_scheme.GetTileSizeForDimension(i); - // Only last row or column may not have full size. - llvm::Value* is_last_row = b_->CreateICmpEQ( - tile_index[i], index_typed_constant(dims_in_tile[i] - 1)); - int64 partial_row_size = - dimensions_in_elements[i] - (dims_in_tile[i] - 1) * tile_size_for_dim; - output_tile_bounds[i] = - b_->CreateSelect(is_last_row, index_typed_constant(partial_row_size), - index_typed_constant(tile_size_for_dim), "tile_bound"); - } - IrArray::Index tile_origin = - GetElementIndexForTileOrigin(tile_index, mapping_scheme, b_); - tile_generator(y, x, tile_origin, "output", output_tile_bounds[1], - output_tile_bounds[2], ksl); -} -static IrArray::Index GetStartingBlockIdx( - const KernelMappingScheme& mapping_scheme, llvm::Type* index_ty, - llvm::IRBuilder<>* b_) { - llvm::Value* block_id = gpu::EmitCallToTargetIntrinsic( - gpu::TargetIntrinsicID::kBlockIdx, {}, {}, b_); - llvm_ir::AddRangeMetadata(0, mapping_scheme.GetNumberOfBlocks(), - llvm::cast(block_id)); - llvm::Value* linear_block_id = - b_->CreateIntCast(block_id, index_ty, /*isSigned=*/true, "block.id.x"); - return IrArray::Index( - linear_block_id, - ShapeUtil::MakeShapeWithDescendingLayout( - PRED /*arbitrary*/, mapping_scheme.GetDimensionsInBlocks()), - b_); -} - -static IrArray::Index GetStartingBlockForDimZ( - const KernelMappingScheme& mapping_scheme, llvm::Type* index_ty, - llvm::IRBuilder<>* b_) { - const IrArray::Index starting_block = - GetStartingBlockIdx(mapping_scheme, index_ty, b_); - std::vector multidim; - multidim.reserve(3); - for (int i = 0; i < 3; ++i) { - multidim.push_back(b_->CreateMul( - starting_block[i], - llvm::ConstantInt::get( - starting_block[i]->getType(), - mapping_scheme.GetNumberOfTilesInOneBlockForDimension(i)), - "block_origin." + std::to_string(i))); - } - return IrArray::Index(multidim, mapping_scheme.GetDimensionsInTiles(), - starting_block.GetType()); -} - -void IrEmitterUnnested::EmitTilingKernel( - const KernelMappingScheme& mapping_scheme, llvm::Type* index_ty, - TileElementGenerator tile_element_generator, - KernelPrologueGenerator kernel_prologue_generator, - KernelEpilogueGenerator kernel_epilogue_generator) { - // Calculate (y, x) coordinate of the thread in the 2D view of thread block + // Calculate (y, x) coordinates respectively in the 2D view of thread block, // defined by (num_thread_y, num_thread_x) from thread_id. 
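  // In other words:
  //
  //   x = thread_id % num_threads_x
  //   y = thread_id / num_threads_x
  //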
llvm::CallInst* thread_id_raw = gpu::EmitCallToTargetIntrinsic( gpu::TargetIntrinsicID::kThreadIdx, {}, {}, &b_); @@ -2552,44 +2482,83 @@ void IrEmitterUnnested::EmitTilingKernel( index_ty, mapping_scheme.GetNumberOfThreadsForDimensionX()); llvm::Value* x = b_.CreateURem(thread_id_int, num_thread_x, "thread.x"); llvm::Value* y = b_.CreateUDiv(thread_id_int, num_thread_x, "thread.y"); - llvm::Value* lane_id = - mapping_scheme.GetNumberOfThreadsForDimensionX() == kWarpSize ? x - : nullptr; + KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); - auto emit_one_tile_for_tile_index = [&](const IrArray::Index& tile_index) { - return EmitOneTileForTileIndex(tile_index, index_ty, y, x, mapping_scheme, - &ksl, &b_, tile_element_generator); + // Calculate the starting tile. + const IrArray::Index starting_tile = [&]() { + llvm::Value* block_id = gpu::EmitCallToTargetIntrinsic( + gpu::TargetIntrinsicID::kBlockIdx, {}, {}, &b_); + llvm_ir::AddRangeMetadata(0, mapping_scheme.GetNumberOfBlocks(), + llvm::cast(block_id)); + llvm::Value* linear_block_id = + b_.CreateIntCast(block_id, index_ty, /*isSigned=*/true, "block.id.x"); + IrArray::Index starting_block( + linear_block_id, + ShapeUtil::MakeShapeWithDescendingLayout( + PRED /*arbitrary*/, mapping_scheme.GetDimensionsInBlocks()), + &b_); + + std::vector multidim; + multidim.reserve(3); + for (int i = 0; i < 3; ++i) { + multidim.push_back( + b_.CreateMul(starting_block[i], + llvm::ConstantInt::get(starting_block[i]->getType(), + mapping_scheme.BlockSize(i)), + "block_origin." + std::to_string(i))); + } + return IrArray::Index(multidim, mapping_scheme.GetDimensionsInTiles(), + starting_block.GetType()); + }(); + + auto emit_tile = [&](const IrArray::Index& tile_index) { + std::vector output_tile_bounds(3); + for (int i = KernelMappingScheme::DimY; i < KernelMappingScheme::DimTot; + ++i) { + int64 tile_size_for_dim = mapping_scheme.GetTileSizeForDimension(i); + // Only last row or column may not have full size. 
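      // That is (sketch):
      //
      //   bound[i] = (tile_index[i] == dims_in_tile[i] - 1)
      //       ? dims_in_elems[i] - (dims_in_tile[i] - 1) * tile_size[i]
      //       : tile_size[i]
      //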
+ llvm::Value* is_last_row = + b_.CreateICmpEQ(tile_index[i], constant(dims_in_tile[i] - 1)); + int64 partial_row_size = + dimensions_in_elements[i] - (dims_in_tile[i] - 1) * tile_size_for_dim; + output_tile_bounds[i] = + b_.CreateSelect(is_last_row, constant(partial_row_size), + constant(tile_size_for_dim), "tile_bound"); + } + IrArray::Index tile_origin = + GetElementIndexForTileOrigin(tile_index, mapping_scheme, &b_); + tile_element_generator(y, x, tile_origin, "output", output_tile_bounds[1], + output_tile_bounds[2], &ksl); }; - const IrArray::Index starting_tile_for_dim_z = - GetStartingBlockForDimZ(mapping_scheme, index_ty, &b_); + int dim_z = KernelMappingScheme::DimZ; - auto emit_tiles_for_block_dim = - [&](const string& loop_name, const IrArray::Index& starting_tile, - int dim_id, - const std::function - emit_next_block_dim) { - EmitTilesForBlockDim(mapping_scheme, &ksl, index_ty, loop_name, - starting_tile, dim_id, &b_, emit_next_block_dim); - }; + if (mapping_scheme.BlockSize(dim_z) == 1) { + emit_tile(starting_tile); + } else { + llvm::Value* starting_tile_index_for_dim = starting_tile[dim_z]; + llvm::Value* block_size_for_dim = constant(mapping_scheme.BlockSize(dim_z)); + llvm::Value* block_id_for_dim = + b_.CreateUDiv(starting_tile_index_for_dim, block_size_for_dim); + llvm::Value* last_block_for_dim = constant(dims_in_block[dim_z] - 1); + llvm::Value* last_block_size_for_dim = + constant(dims_in_tile[dim_z] - + (dims_in_block[dim_z] - 1) * mapping_scheme.BlockSize(dim_z)); - kernel_prologue_generator(lane_id); - - // Emit the three dimensional block of tiles. - emit_tiles_for_block_dim( - "block_dim_z", starting_tile_for_dim_z, KernelMappingScheme::DimZ, - [&](const IrArray::Index& starting_tile_for_dim_y) { - emit_tiles_for_block_dim( - "block_dim_y", starting_tile_for_dim_y, KernelMappingScheme::DimY, - [&](const IrArray::Index& starting_tile_for_dim_x) { - emit_tiles_for_block_dim("block_dim_x", starting_tile_for_dim_x, - KernelMappingScheme::DimX, - emit_one_tile_for_tile_index); + llvm::Value* num_tiles_in_block = + b_.CreateSelect(b_.CreateICmpEQ(last_block_for_dim, block_id_for_dim), + last_block_size_for_dim, block_size_for_dim); + ksl.For("loop_z", + /*start=*/constant(0), + /*end=*/num_tiles_in_block, + /*step=*/1, [&](llvm::Value* block_dim_induction_var) { + IrArray::Index tile_index = starting_tile.AddOffsetToDim( + block_dim_induction_var, dim_z, &b_); + emit_tile(tile_index); }); - }); - - kernel_epilogue_generator(lane_id); + } + return x; } // Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose @@ -2744,23 +2713,20 @@ void IrEmitterUnnested::EmitHlo021Tile( } }; - KernelPrologueGenerator hlo021_prologue = [&](llvm::Value* /*lane_id*/) { - // For multioutput fusion, one thread needs to output a tuple - // with pointers to all the individual outputs. We could do this - // at any point in the kernel, but we do it at the beginning in - // the hopes of reducing register pressure, since we touch - // threadIdx.x and blockIdx.x at the beginning of the kernel - // *anyway*. 
- if (hlo->IsMultiOutputFusion()) { - KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] { - llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), - ConstructIrArrayForOutputs(*hlo), &b_); - }); - } - }; - KernelEpilogueGenerator epilogue_generator = [](llvm::Value* /*lane_id*/) {}; - EmitTilingKernel(mapping_scheme, index_type, tile_generator, hlo021_prologue, - epilogue_generator); + // For multioutput fusion, one thread needs to output a tuple + // with pointers to all the individual outputs. We could do this + // at any point in the kernel, but we do it at the beginning in + // the hopes of reducing register pressure, since we touch + // threadIdx.x and blockIdx.x at the beginning of the kernel + // *anyway*. + if (hlo->IsMultiOutputFusion()) { + KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] { + llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), + ConstructIrArrayForOutputs(*hlo), &b_); + }); + } + + EmitTilingKernel(mapping_scheme, index_type, tile_generator); UpdateLaunchDimensions(launch_dimensions, kernel_thunk, ir_emitter_context_->llvm_module()); } @@ -3093,14 +3059,13 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int64 num_threads_y = 1; bool dilated_x = true; if (is_row_reduction) { + num_threads_x = kWarpSize; if (dims_in_elem[1] == 1) { // Scalar reduction is handled differently than the other kind of row // reduction. CHECK_EQ(dims_in_elem[0], 1); tile_size_x = kWarpSize * 16; - num_threads_x = kWarpSize; } else { - num_threads_x = kWarpSize; if (dims_in_elem[2] % (kWarpSize * 64) == 0) { tile_size_x = kWarpSize * 64; } else { @@ -3209,6 +3174,14 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( ReductionCodegenInfo reduction_info = ComputeReductionCodegenInfo(unnested_hlo, first_reduce); + const KernelMappingScheme& mapping_scheme = + reduction_info.GetKernelMappingScheme(); + LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), + mapping_scheme.GetThreadsPerBlock()); + llvm::Type* index_ty = GetIndexTypeForKernel( + unnested_hlo, launch_dimensions.launch_bound(), &b_); + EmitPrologueForReduction(unnested_hlo, &reduction_info, reduce_instructions, + index_ty); EmitElementFunction emit_reduction_tile = [&](const llvm_ir::IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc, int64 x_iter_num) { @@ -3217,15 +3190,7 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( reducers, x_iter_num); }; - const auto& mapping_scheme = reduction_info.GetKernelMappingScheme(); - LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), - mapping_scheme.GetThreadsPerBlock()); - llvm::Type* index_ty = - reduction_info.IsRowReduction() - ? 
GetIndexTypeForKernel(unnested_hlo, - launch_dimensions.launch_bound(), &b_) - : b_.getInt64Ty(); - EmitTilingKernel( + llvm::Value* lane_id = EmitTilingKernel( mapping_scheme, index_ty, /*tile_element_generator=*/ [&](llvm::Value* y, llvm::Value* x, const IrArray::Index& index, @@ -3234,18 +3199,10 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( EmitTiledElementalCodeWithBoundsCheck( reduction_info.GetKernelMappingScheme(), index, loop_name, ksl, &b_, y, x, tile_height, tile_width, emit_reduction_tile); - }, - /*kernel_prologue_generator=*/ - [&](llvm::Value* /*lane_id*/) { - EmitPrologueForReduction(unnested_hlo, &reduction_info, - reduce_instructions, index_ty); - }, - /*kernel_epilogue_generator=*/ - [&](llvm::Value* lane_id) { - EmitEpilogueForReduction( - unnested_hlo, reduction_info, reduce_instructions, - reduction_output_shape_indices, reducers, lane_id); }); + EmitEpilogueForReduction(unnested_hlo, reduction_info, reduce_instructions, + reduction_output_shape_indices, reducers, lane_id); + UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(), ir_emitter_context_->llvm_module()); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index dbd6ce78bb3..3afcde86f28 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -52,12 +52,6 @@ namespace gpu { class IrEmitterUnnested : public IrEmitter, private ThunkEmitter::EmissionContext { public: - // A function object to prepare for the code generation for a tiling kernel. - using KernelPrologueGenerator = std::function; - - // A function object to finalize the code generation for a tiling kernel. - using KernelEpilogueGenerator = std::function; - // A function object to generate code to process one element in a tile. // // hlo: the instruction for which the code is generated for. @@ -71,6 +65,8 @@ class IrEmitterUnnested : public IrEmitter, const llvm_ir::IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc, int64 x_iter_num)>; + using ConstantGenerator = std::function; + // A function to generate the code to emit the entire tile. using TileElementGenerator = std::function output_instructions, @@ -243,6 +242,9 @@ class IrEmitterUnnested : public IrEmitter, absl::Span reducers, int64 x_iter_num); // Prepares for the code generation for a tile block of a reduction kernel. + // + // Create accumulator alloca's, populate them with initial values, and store + // inside reduction_info. void EmitPrologueForReduction( HloInstruction* unnested_hlo, ReductionCodegenInfo* reduction_info, absl::Span reduce_instructions, @@ -253,7 +255,8 @@ class IrEmitterUnnested : public IrEmitter, ReductionCodegenInfo* kernel_info, GpuElementalIrEmitter* elemental_emitter); - // Wraps up the code generation for a tile block of a reduction kernel. + // Wraps up the code generation for a tile block of a reduction kernel: write + // the calculated output into the output tensor. 
void EmitEpilogueForReduction( HloInstruction* unnested_hlo, const ReductionCodegenInfo& reduction_info, absl::Span reduce_instructions, diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 6e5bb5a1ba7..e25f1b66862 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -85,21 +85,18 @@ class KernelMappingScheme { dims_in_tiles_{dims_in_elems[0], CeilOfRatio(dims_in_elems[1], tile_size_y), CeilOfRatio(dims_in_elems[2], tile_size_x)}, - block_sizes_{block_size_z, 1, 1}, - dims_in_blocks_{CeilOfRatio(dims_in_elems[0], block_sizes_[0]), - dims_in_tiles_[1], dims_in_tiles_[2]}, + dims_in_blocks_{dims_in_elems[0] / block_size_z, dims_in_tiles_[1], + dims_in_tiles_[2]}, + block_size_z_{block_size_z}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), dilated_x_(is_dilated_x) { CHECK_EQ(tile_size_y % num_threads_y_, 0); CHECK_EQ(tile_size_x % num_threads_x_, 0); CHECK_EQ((dims_in_elems[0] % block_size_z), 0); - VLOG(10) << "dims_in_elems_ = [" << absl::StrJoin(dims_in_elems_, ",") - << "]"; - VLOG(10) << "dims_in_tiles_ = [" << absl::StrJoin(dims_in_tiles_, ",") - << "]"; - VLOG(10) << "dims_in_blocks_ = [" << absl::StrJoin(dims_in_blocks_, ",") - << "]"; + VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); + VLOG(10) << "dims_in_tiles_ = " << absl::StrJoin(dims_in_tiles_, ","); + VLOG(10) << "dims_in_blocks_ = " << absl::StrJoin(dims_in_blocks_, ","); if (!dilated_x_) { // dilated_x_=false is for the purpose of vectorization, which requires // GetTileSizeForDimension(DimX) to be a multiplier of num_threads_x_. @@ -127,13 +124,14 @@ class KernelMappingScheme { return absl::c_accumulate(dims_in_tiles_, 1LL, std::multiplies()); } - int64 GetNumberOfTilesInOneBlock() const { - return absl::c_accumulate(block_sizes_, 1, std::multiplies()); - } + int64 GetNumberOfTilesInOneBlock() const { return block_size_z_; } - int64 GetNumberOfTilesInOneBlockForDimension(int d) const { + int64 BlockSize(int d) const { DCHECK(d >= DimZ && d <= DimX); - return block_sizes_[d]; + if (d == DimZ) { + return block_size_z_; + } + return 1; } int64 GetNumberOfBlocks() const { @@ -148,7 +146,6 @@ class KernelMappingScheme { return GetTileSizeForDimension(DimY); } - absl::Span GetBlockSizes() const { return block_sizes_; } int64 GetTileBlockSizeForDimension(int d) const { return dims_in_blocks_.at(d); } @@ -165,31 +162,31 @@ class KernelMappingScheme { private: // The number of elements in each dimension. - std::array dims_in_elems_; + const std::array dims_in_elems_; // The number of elements for each dimension of a tile. - std::array tile_sizes_; + const std::array tile_sizes_; // The number of tiles in each dimension. It is computed from dims_in_elem_ // and tile_sizes_. - std::array dims_in_tiles_; + const std::array dims_in_tiles_; - // The number of tiles for each dimension of a tile block. - std::array block_sizes_; // The number of blocks in each dimension of a tile block. It is computed from // dims_in_tile_ and block_sizes_. - std::array dims_in_blocks_; + const std::array dims_in_blocks_; + + const int64 block_size_z_; // Number of threads used to process elements in the X direction of a tile. - int64 num_threads_x_; + const int64 num_threads_x_; // Number of threads used to process elements in the Y direction of a tile. 
- int64 num_threads_y_; + const int64 num_threads_y_; // When num_threads_x threads process a total of tile_size_x elements in the // X dimension of a tile, each threads process n=tile_size_x/num_threads_x // elements. When dilated_x=false, the n elements processed by a thread are // contiguous. On the other hand, when dilated_x=true the n elements are // dilated by a factor of num_threads_x. - bool dilated_x_; + const bool dilated_x_; }; // Information to support the code generation for a tiled reduction kernel. diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index fa903f7233a..e525b4b1de9 100755 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -23,10 +23,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/gpu/cublas_gemm_pad_for_tensor_cores.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h" -#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.h" #include "tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.h" #include "tensorflow/compiler/xla/service/gpu/gemm_rewriter.h" @@ -50,7 +50,7 @@ limitations under the License. #include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" -#include "tensorflow/stream_executor/cuda/ptxas_utils.h" +#include "tensorflow/stream_executor/gpu/asm_compiler.h" namespace xla { namespace gpu { @@ -115,12 +115,11 @@ Status NVPTXCompiler::OptimizeHloConvolutionCanonicalization( pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); - if (IsVoltaOrLater(*stream_exec)) { - pipeline.AddPass(); - // CudnnConvPadForTensorCores leaves behind unnecessary - // tuple/get-tuple-element pairs that TupleSimplifier fixes. - pipeline.AddPass(); - } + pipeline.AddPass(IsVoltaOrLater(*stream_exec)); + // CudnnConvPadForIntegerConvolutions and CudnnConvPadForTensorCores leaves + // behind unnecessary tuple/get-tuple-element pairs that TupleSimplifier + // fixes. 
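  // (Concretely, a pattern like get-tuple-element(tuple(x, ...), 0) that
  // TupleSimplifier folds back to x.)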
+ pipeline.AddPass(); // tf2xla bridge, DepthwiseConvolutionConverter and CudnnConvRewriter // introduces reshapes and transposes that can be eliminated using @@ -362,18 +361,18 @@ NVPTXCompiler::CompileTargetBinary(const HloModule* module, DumpToFileInDirOrStdout(*module, "ptx", ptx); } - std::vector cubin = - CompilePtxOrGetCachedResult(stream_exec, ptx, compute_capability.first, - compute_capability.second, module->config()); + std::vector cubin = CompileGpuAsmOrGetCachedResult( + stream_exec, ptx, compute_capability.first, compute_capability.second, + module->config()); return std::pair>(std::move(ptx), std::move(cubin)); } -std::vector NVPTXCompiler::CompilePtxOrGetCachedResult( +std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( se::StreamExecutor* stream_exec, const string& ptx, int cc_major, int cc_minor, const HloModuleConfig& hlo_module_config) { - XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult"); + XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompileGpuAsmOrGetCachedResult"); tensorflow::profiler::TraceMe activity( "PTX->CUBIN", tensorflow::profiler::TraceMeLevel::kInfo); bool inserted; @@ -401,9 +400,9 @@ std::vector NVPTXCompiler::CompilePtxOrGetCachedResult( if (inserted) { CHECK(!cache_value->compilation_done); if (!ptx.empty()) { - StatusOr> maybe_cubin = se::cuda::CompilePtx( - stream_exec->device_ordinal(), cache_ptx->c_str(), - PtxOptsFromConfig(hlo_module_config)); + StatusOr> maybe_cubin = + se::CompileGpuAsm(stream_exec->device_ordinal(), cache_ptx->c_str(), + PtxOptsFromConfig(hlo_module_config)); if (maybe_cubin.ok()) { cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie(); VLOG(2) << "Compiled PTX size:" << ptx.size() diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index a7b38afb8ec..3098d5af25f 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -67,7 +67,7 @@ class NVPTXCompiler : public GpuCompiler { // Tries to compile the given ptx string to cubin. Returns a vector with the // compiled cubin. If compilation was unsuccessful, returns an empty vector. 
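  //
  // Typical call (mirroring the call site in nvptx_compiler.cc):
  //
  //   std::vector<uint8> cubin = CompileGpuAsmOrGetCachedResult(
  //       stream_exec, ptx, compute_capability.first,
  //       compute_capability.second, module->config());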
- std::vector CompilePtxOrGetCachedResult( + std::vector CompileGpuAsmOrGetCachedResult( se::StreamExecutor* stream_exec, const string& ptx, int cc_major, int cc_minor, const HloModuleConfig& hlo_module_config); diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc index 117931e3398..8df21c3dfb1 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc @@ -220,10 +220,9 @@ Status ExecuteKernelOnStream(const se::KernelBase& kernel, *kernel_args); } -se::cuda::PtxCompilationOptions PtxOptsFromConfig( - const HloModuleConfig& hlo_module_config) { - return se::cuda::PtxCompilationOptions( - hlo_module_config.debug_options().xla_gpu_disable_ptxas_optimizations(), +se::GpuAsmOpts PtxOptsFromConfig(const HloModuleConfig& hlo_module_config) { + return se::GpuAsmOpts( + hlo_module_config.debug_options().xla_gpu_disable_gpuasm_optimizations(), hlo_module_config.debug_options().xla_gpu_cuda_data_dir()); } @@ -245,10 +244,6 @@ template static void InitializeTypedBuffer(se::Stream* stream, se::DeviceMemoryBase buffer, int64* rng_state) { - static_assert( - std::is_floating_point::value || std::is_same::value, - "Unimplemented for integers yet."); - // Accesses to static variables are not locked, since the caller is already // in a critical section. static std::vector* host_buffer = [] { @@ -257,13 +252,23 @@ static void InitializeTypedBuffer(se::Stream* stream, // Default-seeded random numbers. std::mt19937 gen; for (auto& element : *ret) { - using RandomType = + // Only double gets random values in double. Other data types get random + // values in float then cast them to the target data types. + using RandomFloatingPointType = typename std::conditional::value, float, T>::type; + using RandomType = + typename std::conditional::value, float, + RandomFloatingPointType>::type; // Scale down the values for fp16 to have less overflows. auto upper_bound = RandomType(std::is_same::value ? 0.1 : 1.0); - element = T(UniformDistribution(RandomType(0), upper_bound, &gen)); + auto rand_val = UniformDistribution(RandomType(0), upper_bound, &gen); + // For float or double, it is between [0,1]. + // For fp16, it ranges between [0, 0.1]. + // For integer types, element is either 0 or 1 for less overflows + // especially for int8. + element = T(std::is_integral::value ? 
rand_val + 0.5 : rand_val); } return ret; }(); @@ -289,8 +294,8 @@ static void InitializeTypedBuffer(se::Stream* stream, } } -void InitializeFloatBuffer(se::Stream* stream, PrimitiveType buffer_type, - int64* rng_state, se::DeviceMemoryBase buffer) { +void InitializeBuffer(se::Stream* stream, PrimitiveType buffer_type, + int64* rng_state, se::DeviceMemoryBase buffer) { switch (buffer_type) { case xla::F16: return InitializeTypedBuffer(stream, buffer, rng_state); @@ -300,6 +305,8 @@ void InitializeFloatBuffer(se::Stream* stream, PrimitiveType buffer_type, case xla::F64: case xla::C128: return InitializeTypedBuffer(stream, buffer, rng_state); + case xla::S8: + return InitializeTypedBuffer(stream, buffer, rng_state); default: LOG(FATAL) << "Unexpected type"; } diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h index 5da2931f049..3e2ae241a03 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h @@ -24,7 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/stream_executor/cuda/ptxas_utils.h" +#include "tensorflow/stream_executor/gpu/asm_compiler.h" #include "tensorflow/stream_executor/kernel_spec.h" // Helper functions for interacting with StreamExecutor. @@ -74,9 +74,8 @@ Status ExecuteKernelOnStream(const se::KernelBase& kernel, int64 threads_per_block, int64 block_count, se::Stream* stream); -// Create PtxCompilationOptions out of HloModuleConfig. -se::cuda::PtxCompilationOptions PtxOptsFromConfig( - const HloModuleConfig& hlo_module_config); +// Create GpuAsmOpts out of HloModuleConfig. +se::GpuAsmOpts PtxOptsFromConfig(const HloModuleConfig& hlo_module_config); // Initializes `buffer` with random data on `stream`. // `rng_state` is an inout parameter for the pseudorandom generator state. @@ -84,8 +83,8 @@ se::cuda::PtxCompilationOptions PtxOptsFromConfig( // // Precondition: `buffer_type` is a floating point type, `rng_state` needs to be // initalized to zero on the first use. 
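//
// Example (hypothetical stream and buffer):
//
//   int64 rng_state = 0;
//   InitializeBuffer(stream, xla::F32, &rng_state, buffer);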
-void InitializeFloatBuffer(se::Stream* stream, PrimitiveType buffer_type, - int64* rng_state, se::DeviceMemoryBase buffer); +void InitializeBuffer(se::Stream* stream, PrimitiveType buffer_type, + int64* rng_state, se::DeviceMemoryBase buffer); } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index 9ec22df1b47..2b6383b6e3e 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -185,6 +185,12 @@ HloInstruction* MakeBroadcastHlo(HloInstruction* operand, broadcast_shape, operand, broadcast_dimensions)); } +HloInstruction* MakeBroadcastHlo(HloInstruction* operand, + absl::Span broadcast_dimensions, + const Shape& shape) { + return MakeBroadcastHlo(operand, broadcast_dimensions, shape.dimensions()); +} + StatusOr MakeGetTupleElementHlo(HloInstruction* operand, int64 index) { HloComputation* computation = operand->parent(); @@ -224,6 +230,22 @@ HloInstruction* MakeConvertToHlo(HloInstruction* hlo, PrimitiveType type) { return hlo; } +HloInstruction* MakeBitcastConvertToHlo(HloInstruction* hlo, + PrimitiveType type) { + CHECK_NE(hlo->shape().element_type(), type); + Shape shape = ShapeUtil::ChangeElementType(hlo->shape(), type); + hlo = hlo->parent()->AddInstruction( + HloInstruction::CreateBitcastConvert(shape, hlo)); + CHECK_EQ(hlo->shape().element_type(), type); + return hlo; +} + +HloInstruction* MakeIotaHlo(HloComputation* computation, const Shape& shape, + int64 iota_dimension) { + return computation->AddInstruction( + HloInstruction::CreateIota(shape, iota_dimension)); +} + StatusOr MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs, const DotDimensionNumbers& dim_numbers, const PrecisionConfig& precision_config) { diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h index e56199650cb..986bed79af9 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.h +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h @@ -91,6 +91,9 @@ StatusOr MakeDynamicUpdateSliceHlo( HloInstruction* MakeBroadcastHlo(HloInstruction* operand, absl::Span broadcast_dimensions, absl::Span result_shape_bounds); +HloInstruction* MakeBroadcastHlo(HloInstruction* operand, + absl::Span broadcast_dimensions, + const Shape& shape); // Creates a GetTupleElement HLO instruction and adds it to the computation // containing `operand`. @@ -107,6 +110,14 @@ StatusOr MakeConcatHlo( // the given primitive type. HloInstruction* MakeConvertToHlo(HloInstruction* hlo, PrimitiveType type); +// Creates a BitcastConvert HLO instruction. +HloInstruction* MakeBitcastConvertToHlo(HloInstruction* hlo, + PrimitiveType type); + +// Creates an Iota HLO instruction. +HloInstruction* MakeIotaHlo(HloComputation* computation, const Shape& shape, + int64 iota_dimension); + // Creates a Dot HLO instruction and adds it to the computation containing `lhs` // and `rhs` (both must be in the same computation). 
StatusOr MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs, diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc index 6025e6a7794..3c27366a8e6 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc @@ -41,6 +41,21 @@ class HloCreationUtilsTest : public HloTestBase { *param = (*entry_computation)->parameter_instruction(0); return module; } + + std::unique_ptr CreateModuleWithProgramShape( + PrimitiveType primitive_type, absl::Span input_shape_dims, + absl::Span output_shape_dims, HloInstruction** param, + HloComputation** entry_computation, PrimitiveType primitive_type_output) { + Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims); + Shape output_shape = + ShapeUtil::MakeShape(primitive_type_output, output_shape_dims); + auto module = CreateNewVerifiedModule("test"); + *entry_computation = module->AddEntryComputation( + CreateComputationWithSignature({&input_shape}, output_shape, "entry") + .ValueOrDie()); + *param = (*entry_computation)->parameter_instruction(0); + return module; + } }; TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) { @@ -222,5 +237,85 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) { LiteralUtil::CreateR2({{0.0f, 0.0f}, {0.0f, 0.0f}})); } +TEST_F(HloCreationUtilsTest, MakeBitcastConvertToHlo_S32) { + HloInstruction* param; + HloComputation* entry_computation; + + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2, 2}, + /*output_shape_dims=*/{2, 2}, + ¶m, &entry_computation, F32); + auto* input = module->entry_computation()->AddInstruction( + HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{0, 0}, {0, 0}}))); + + HloInstruction* output = MakeBitcastConvertToHlo(input, F32); + entry_computation->set_root_instruction(output); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN( + Literal result_literal, + evaluator.Evaluate(*module, + {LiteralUtil::CreateR2({{0, 0}, {0, 0}})})); + CHECK_EQ(result_literal, + LiteralUtil::CreateR2({{0.0f, 0.0f}, {0.0f, 0.0f}})); +} + +TEST_F(HloCreationUtilsTest, MakeIotaHlo_I32) { + HloInstruction* param; + HloComputation* entry_computation; + + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{}, + /*output_shape_dims=*/{2, 2}, + ¶m, &entry_computation, F32); + HloInstruction* output = MakeIotaHlo(module->entry_computation(), + ShapeUtil::MakeShape(F32, {2, 2}), 0); + entry_computation->set_root_instruction(output); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN( + Literal result_literal, + evaluator.Evaluate(*module, {LiteralUtil::CreateR0(0.0)})); + CHECK_EQ(result_literal, + LiteralUtil::CreateR2({{0.0f, 0.0f}, {1.0f, 1.0f}})); +} + +TEST_F(HloCreationUtilsTest, MakeBroadcast_F32) { + HloInstruction* param; + HloComputation* entry_computation; + + auto module = CreateModuleWithProgramShape(F32, /*input_shape_dims=*/{}, + /*output_shape_dims=*/{2, 2}, + ¶m, &entry_computation); + auto* input = MakeR0ConstantHlo(module->entry_computation(), 0); + HloInstruction* output = MakeBroadcastHlo(input, {}, {2, 2}); + entry_computation->set_root_instruction(output); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN( + Literal result_literal, + evaluator.Evaluate(*module, {LiteralUtil::CreateR0(0.0f)})); + CHECK_EQ(result_literal, + LiteralUtil::CreateR2({{0.0f, 0.0f}, {0.0f, 0.0f}})); +} + +TEST_F(HloCreationUtilsTest, MakeBroadcast_Shape_I32) { + HloInstruction* param; + HloComputation* 
entry_computation; + + auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{}, + /*output_shape_dims=*/{2, 2}, + ¶m, &entry_computation); + auto* input = MakeR0ConstantHlo(module->entry_computation(), 0); + HloInstruction* output = + MakeBroadcastHlo(input, {}, ShapeUtil::MakeShape(S32, {2, 2})); + entry_computation->set_root_instruction(output); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN( + Literal result_literal, + evaluator.Evaluate(*module, {LiteralUtil::CreateR0(0.0)})); + CHECK_EQ(result_literal, LiteralUtil::CreateR2({{0, 0}, {0, 0}})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index 47100e3fc58..188f196fd3c 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -189,8 +189,20 @@ bool HloDataflowAnalysis::Phi( for (const InstructionValueSet* input : inputs) { VLOG(5) << "input value set = " << input->ToString(); } - for (const InstructionValueSet* input : inputs) { - DCHECK(ShapeUtil::Compatible(instruction->shape(), input->shape())); + + if (bitcast_defines_value_) { + absl::c_for_each(inputs, [&](const InstructionValueSet* input) { + DCHECK(ShapeUtil::Compatible(instruction->shape(), input->shape())); + }); + } else { + const Shape& shape = instruction->shape(); + PrimitiveType ty = shape.element_type(); + bool is_array = shape.IsArray(); + absl::c_for_each(inputs, [&](const InstructionValueSet* input) { + DCHECK(ty == input->shape().element_type() && + (!is_array || ShapeUtil::ElementsIn(shape) == + ShapeUtil::ElementsIn(input->shape()))); + }); } bool changed = false; @@ -774,9 +786,9 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { std::forward_as_tuple(instruction), std::forward_as_tuple(instruction->shape())); - // Lambda to set the value set to define all values in the output of the - // instruction. - auto define_all_values = [this, &instruction](bool is_phi = false) { + // For each sub-shape of the instruction shape, add a new HloValue to its + // HloValueSet. + auto define_all_values = [this, &instruction]() { for (auto& pair : GetInstructionValueSet(instruction)) { const ShapeIndex& index = pair.first; HloValue* value = NewHloValue(instruction, index, /*is_phi=*/false); @@ -784,16 +796,8 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { } }; - // Lambda to set the value set to define only the top-level buffer in the - // output of the instruction. Any other values flow from the operands of - // the instruction (or from cross-computation dataflow). - auto define_top_level_only = [this, &instruction]() { - HloValue* value = - NewHloValue(instruction, /*index=*/{}, /*is_phi=*/false); - GetValueSet(instruction, /*index=*/{}).AddValue(value); - }; - - // Lambda to set the value set at the given index of the output. + // Add a new HloValue to the HloValueSet corresponding to the given index + // of the instruction shape. auto define_value_at = [this, &instruction](const ShapeIndex& index) { HloValue* value = NewHloValue(instruction, index, /*is_phi=*/false); GetValueSet(instruction, index).AddValue(value); @@ -840,7 +844,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { case HloOpcode::kTuple: // These instructions only define their top-level values. Any other // values flow from their operands. 
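          // For example, t = tuple(a, b) defines a new HloValue only at
          // ShapeIndex {}; the values at {0} and {1} flow from a and b.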
- define_top_level_only(); + define_value_at(/*index=*/{}); break; case HloOpcode::kCopyDone: // CopyDone produces an element. Its output aliases its input tuple diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index a4966f9e2ba..79cd11f033e 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -52,7 +52,7 @@ class HloDataflowAnalysis { const HloInstruction* instr, const HloInstruction* operand, const ShapeIndex& user_index)>; - // Run dataflow analysis on the given module. Parameters: + // Runs dataflow analysis on the given module. Parameters: // // ssa_form : If true then new values are defined at the merge points of // kWhile instructions. Abusing nomenclature somewhat, we call these "phi @@ -81,7 +81,7 @@ class HloDataflowAnalysis { bool ValueIsDefinedAt(const HloInstruction* instruction, const ShapeIndex& index = {}) const; - // Return the HloValue defined by 'instruction' at the given shape index of + // Returns the HloValue defined by 'instruction' at the given shape index of // its output. // // Precondition: ValueIsDefinedAt is true for this instruction and index. @@ -90,7 +90,7 @@ class HloDataflowAnalysis { HloValue& GetValueDefinedAt(const HloInstruction* instruction, const ShapeIndex& index = {}); - // Return the InstructionValueSet for the given instruction. + // Returns the InstructionValueSet for the given instruction. const InstructionValueSet& GetInstructionValueSet( const HloInstruction* instruction) const; InstructionValueSet& GetInstructionValueSet( @@ -100,7 +100,7 @@ class HloDataflowAnalysis { // a flattened set. HloValueSet GetFlattenedValueSet(const HloInstruction* instruction) const; - // Return the HloValueSet for the given instruction at the given index or the + // Returns the HloValueSet for the given instruction at the given index or the // given position. const HloValueSet& GetValueSet(const HloInstruction* instruction, const ShapeIndex& index = {}) const; @@ -109,7 +109,7 @@ class HloDataflowAnalysis { HloValueSet& GetValueSet(const HloInstruction* instruction, const ShapeIndex& index = {}); - // Return the unique value in the HloValueSet at the given instruction and + // Returns the unique value in the HloValueSet at the given instruction and // shape index. CHECKs if the value set does not contain a exactly one value. const HloValue& GetUniqueValueAt(const HloInstruction* instruction, const ShapeIndex& index = {}) const { @@ -120,17 +120,17 @@ class HloDataflowAnalysis { return GetValue(GetValueSet(instruction, index).GetUniqueValue().id()); } - // Return the HloValue with the given Id. + // Returns the HloValue with the given Id. const HloValue& GetValue(HloValue::Id value_id) const; HloValue& GetValue(HloValue::Id value_id); - // Return the total number of HloValues. + // Returns the total number of HloValues. int64 value_count() const { return values_.size(); } - // Return a vector of all HloValues stabily sorted by HloValue::Id. + // Returns a vector of all HloValues stabily sorted by HloValue::Id. const std::vector& values() const { return values_vector_; } - // Return the call graph used for computing the dataflow. + // Returns the call graph used for computing the dataflow. 
const CallGraph& call_graph() const { return *call_graph_; } string ToString() const; @@ -164,10 +164,10 @@ class HloDataflowAnalysis { HloValue* NewHloValue(HloInstruction* instruction, const ShapeIndex& index, bool is_phi = false); - // Mark the HloValue with the given ID for deletion. + // Marks the HloValue with the given ID for deletion. void MarkValueForDeletion(HloValue::Id value_id); - // Delete all HloValues marked for deletion. Should be called after + // Deletes all HloValues marked for deletion. Should be called after // propagation is complete. void DeleteMarkedValues(); @@ -197,12 +197,13 @@ class HloDataflowAnalysis { bool UpdateWhileValueSet(HloInstruction* xla_while); bool UpdateAddDependencyValueSet(HloInstruction* add_dependency); - // Propagate the dataflow through the module. + // Propagates the dataflow through the module. In particular, it propagates + // the HloValueSet from its defining instruction to the users of the + // instructions. void Propagate(); - // Return the result of the SSA Phi function applied to the given inputs at - // the given instruction. If skip_top_level is true, then the top level of the - // value set of 'instruction' is not modified. + // Returns the result of the SSA Phi function applied to the given inputs at + // the given instruction. bool Phi(HloInstruction* instruction, absl::Span inputs); @@ -217,7 +218,7 @@ class HloDataflowAnalysis { HloInstruction* instruction, const InstructionValueSet& new_value_set, const InstructionValueSet* prev_value_set = nullptr); - // Verify various invariants of the dataflow analysis. + // Verifies various invariants of the dataflow analysis. Status Verify() const; const HloModule& module_; diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 211e5d830f5..bae803bdaa0 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -410,9 +410,9 @@ Status HloEvaluator::HandleGetDimensionSize( } const Shape& shape = get_dimension_size->operand(0)->shape(); - Literal output(ShapeUtil::MakeShape(U32, {})); + Literal output(ShapeUtil::MakeShape(S32, {})); output.PopulateWithValue( - static_cast(shape.dimensions(get_dimension_size->dimension()))); + static_cast(shape.dimensions(get_dimension_size->dimension()))); evaluated_[get_dimension_size] = std::move(output); return Status::OK(); } @@ -1719,6 +1719,10 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) { /*output_shape=*/shape); const Shape& operand_shape = operand.shape(); + if (ShapeUtil::IsZeroElementArray(operand_shape)) { + evaluated_[gather] = std::move(result); + return Status::OK(); + } auto gather_inner_loop_body = [&](absl::Span output_window_index, diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index 888434774bb..eff012065dc 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -4154,13 +4154,13 @@ TEST_F(HloEvaluatorTest, GetDimensionSize) { HloModule Test ENTRY main { - size = u32[] parameter(0) + size = s32[] parameter(0) data = s32[4] parameter(1) sum = s32[4] add(data, data) - ROOT dynamic_size = u32[] get-dimension-size(sum), dimensions={0} + ROOT dynamic_size = s32[] get-dimension-size(sum), dimensions={0} } )"; TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); @@ -4174,12 +4174,12 @@ ENTRY main { DynamicDimensionInference::Run(m_.get())); 
evaluator_.set_dynamic_dimension_inference(&dynamic_dimension_inference); - Literal size_arg = LiteralUtil::CreateR0(3); + Literal size_arg = LiteralUtil::CreateR0(3); Literal data_arg = LiteralUtil::CreateR1({1, 2, 3, 4}); TF_ASSERT_OK_AND_ASSIGN(Literal actual, Evaluate({&size_arg, &data_arg})); - EXPECT_EQ(actual.GetFirstElement(), static_cast(3)); + EXPECT_EQ(actual.GetFirstElement(), static_cast(3)); } // Check that we get a useful error if we pass inputs of the wrong shape. diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index 9487d955f31..6fa3f9fb34b 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -1389,9 +1389,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { *(accumulate_index_locations[i].second) = accumulate_index[i]; } + ElementwiseT lhs_val(lhs_literal.Get(lhs_index)); + ElementwiseT rhs_val(rhs_literal.Get(rhs_index)); result_val += - static_cast(lhs_literal.Get(lhs_index)) * - static_cast(rhs_literal.Get(rhs_index)); + ToArithmeticSafeType(lhs_val) * ToArithmeticSafeType(rhs_val); // If there are no contracting dimension accumulate_index_sizes is // empty, do not try to count down from -1 to 0 since it is and diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc index 862b2029718..937c535e550 100644 --- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc +++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc @@ -37,8 +37,10 @@ StatusOr ReplaceGetSize( TF_ASSIGN_OR_RETURN(auto legal_shape, ShapeInference::InferGetDimensionSizeShape( instr->operand(0)->shape(), instr->dimension())); - TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape)); - TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32)); + TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape)) + << "instr->shape() " << instr->shape().ToString() << " , " + << "legal_shape " << legal_shape.ToString(); + TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), S32)); HloInstruction* operand = instr->mutable_operand(0); int64 dim = instr->dimension(); HloInstruction* dynamic_size = @@ -46,9 +48,9 @@ StatusOr ReplaceGetSize( if (dynamic_size != nullptr) { TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(dynamic_size)); } else { - uint32 size = instr->operand(0)->shape().dimensions(dim); + int32 size = instr->operand(0)->shape().dimensions(dim); HloInstruction* new_instr = computation->AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(size))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(size))); TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr)); } return true; diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc index bc240435be8..a0a06d53ea2 100644 --- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc @@ -44,9 +44,9 @@ TEST_F(HloGetDimensionSizeRewriterTest, Ok) { HloModule _ ENTRY gds { p = s32[3,4] parameter(0) - size0 = u32[] get-dimension-size(p), dimensions={0} - size1 = u32[] get-dimension-size(p), dimensions={1} - ROOT mul = u32[] multiply(size0, size1) + size0 = s32[] get-dimension-size(p), dimensions={0} + size1 = s32[] 
get-dimension-size(p), dimensions={1} + ROOT mul = s32[] multiply(size0, size1) })") .ValueOrDie(); HloGetDimensionSizeRewriter pass; @@ -72,7 +72,7 @@ TEST_F(HloGetDimensionSizeRewriterTest, IllegalDimension) { HloModule _ ENTRY gds { p = f32[2,5] parameter(0) - ROOT gds = u32[] get-dimension-size(p), dimensions={2} + ROOT gds = s32[] get-dimension-size(p), dimensions={2} })") .ValueOrDie(); HloGetDimensionSizeRewriter pass; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 82f3b245590..c93f0106075 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -1627,14 +1627,8 @@ bool HloFusionInstruction::IdenticalSlowPath( other.fused_instructions_computation()); } -static uint64 HashOperandRecursive(const HloInstruction* hlo) { - return hlo->Hash(HashOperandRecursive); -} - uint64 HloFusionInstruction::InnerHash() const { - // Use HashOperandRecursive to recursively compute hash on inner operands. - return fused_instructions_computation()->root_instruction()->Hash( - HashOperandRecursive); + return fused_instructions_computation()->root_instruction()->Hash(); } std::unique_ptr HloFusionInstruction::CloneWithNewOperandsImpl( diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index 78c48e036d6..acc077ab12d 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -21,6 +21,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -77,7 +78,7 @@ HloValue::HloValue(HloValue::Id id, HloInstruction* instruction, bool HloValue::operator==(const HloValue& other) const { bool equal = defining_instruction() == other.defining_instruction() && defining_index() == other.defining_index(); - // If the values are equal they most both be phi (or non phi). + // If the values are equal they must both be phi (or non phi). CHECK(!(equal && is_phi() != other.is_phi())); return equal; } @@ -87,17 +88,17 @@ bool HloValue::operator!=(const HloValue& other) const { } string HloValue::ToShortString() const { - string index_str = defining_instruction()->shape().IsTuple() - ? defining_index().ToString() - : ""; - return StrCat(id(), " ", is_phi_ ? "PHI " : "", - defining_instruction()->name(), index_str, " @", - (has_color() ? color().value() : -1)); + return absl::StrFormat( + "<%d %s%s%s%s>", id(), instruction()->name(), + instruction()->shape().IsTuple() ? index().ToString() : "", + is_phi() ? " (phi)" : "", + has_color() ? 
StrCat(" @", color().value()) : ""); } string HloValue::ToString(int indent) const { string indentation(indent, ' '); - string out = StrCat(indentation, ToShortString(), ", positions:\n"); + string out = + StrCat(indentation, ToShortString(), "\n", indentation, " positions:\n"); for (const HloPosition& position : positions()) { StrAppend(&out, indentation, " ", position.ToString(), "\n"); } diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc index b1a26b3b586..2606e2e4bf7 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.cc +++ b/tensorflow/compiler/xla/service/interpreter/executor.cc @@ -33,7 +33,9 @@ XlaInterpreterExecutor::XlaInterpreterExecutor( XlaInterpreterExecutor::~XlaInterpreterExecutor() {} -void *XlaInterpreterExecutor::Allocate(uint64 size) { return new char[size]; } +DeviceMemoryBase XlaInterpreterExecutor::Allocate(uint64 size) { + return DeviceMemoryBase(new char[size], size); +} void *XlaInterpreterExecutor::GetSubBuffer(DeviceMemoryBase *parent, uint64 offset_bytes, diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 43493b6e154..ce94dbe7a6f 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -68,7 +68,7 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { return port::UnimplementedError("Not Implemented"); } - void *Allocate(uint64 size) override; + DeviceMemoryBase Allocate(uint64 size) override; void *GetSubBuffer(DeviceMemoryBase *parent, uint64 offset_bytes, uint64 size_bytes) override; void Deallocate(DeviceMemoryBase *mem) override; diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index bf1df58f0b8..2fe3f9aa03e 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/call_graph.h" #include "tensorflow/compiler/xla/service/computation_layout.h" +#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" @@ -1964,6 +1965,41 @@ Status LayoutAssignment::ConstrainChannelLayouts( return Status::OK(); } +Status LayoutAssignment::PropagateMemorySpace(HloModule* module) { + TF_ASSIGN_OR_RETURN(auto alias_analysis, HloAliasAnalysis::Run(module)); + for (auto buffer : alias_analysis->buffers()) { + // First go through values to collect the memory spaces. 
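    // (Sketch: a buffer's memory space is the unique non-default space among
    // its values' defining shapes; two distinct non-default spaces are an
    // error.)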
+ int64 buffer_memory_space = Layout::kDefaultMemorySpace; + for (auto value : buffer.values()) { + const Shape& defining_shape = value->defining_position().shape(); + int64 memory_space = defining_shape.layout().memory_space(); + if (memory_space != Layout::kDefaultMemorySpace) { + if (buffer_memory_space != Layout::kDefaultMemorySpace && + memory_space != buffer_memory_space) { + return InternalError( + "Buffer %d (%s) has conflicting memory spaces: %d and %d.", + buffer.id(), value->ToShortString(), buffer_memory_space, + memory_space); + } + buffer_memory_space = memory_space; + } + } + + // If we encounter a memory space other than the default, then propagate all + // the positions with the buffer's memory space. + if (buffer_memory_space != Layout::kDefaultMemorySpace) { + for (auto value : buffer.values()) { + for (auto& position : value->positions()) { + Shape* shape = ShapeUtil::GetMutableSubshape( + position.instruction->mutable_shape(), position.index); + shape->mutable_layout()->set_memory_space(buffer_memory_space); + } + } + } + } + return Status::OK(); +} + Status LayoutAssignment::PropagateComputationLayouts( HloComputation* computation, ComputationLayout* computation_layout) { ComputationLayout computed_computation_layout( @@ -2076,6 +2112,9 @@ StatusOr LayoutAssignment::Run(HloModule* module) { } TF_RETURN_IF_ERROR(PropagateComputationLayouts(module->entry_computation(), entry_computation_layout_)); + + TF_RETURN_IF_ERROR(PropagateMemorySpace(module)); + TF_RETURN_IF_ERROR(CheckLayouts(module)); // All layouts are reset then reassigned by this pass. diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h index 6a202837e14..a0f61fc416d 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.h +++ b/tensorflow/compiler/xla/service/layout_assignment.h @@ -356,6 +356,10 @@ class LayoutAssignment : public HloModulePass { const HloInstruction* instruction, LayoutConstraints* constraints); + // Propagates the memory space defined in the entry computation to the called + // computations. + Status PropagateMemorySpace(HloModule* module); + // Chooses a layout of operand `operand_no` of `instruction` that minimizes // the cost of `instruction`. `output_layout` is the layout of `instruction`. // Returns null if it can't decide the best layout. diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc index c23b343d902..fa9a606568f 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc @@ -188,6 +188,8 @@ StatusOr LhloDialectEmitter::CreateFunction( FuncOp::create(builder_.getUnknownLoc(), instr.name(), function_type); mlir_module_.push_back(function); function.addEntryBlock(); + OpBuilder op_builder(function.getBody()); + op_builder.create<::mlir::ReturnOp>(builder_.getUnknownLoc()); instruction_to_mlir_func_[&instr] = function; return function; } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc index eef17132efa..22f60374ee9 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc @@ -191,11 +191,9 @@ StatusOr> MlirCompiler::RunBackend( } // TODO(b/137624192): Add profiling support. 
- - return static_cast<StatusOr<std::unique_ptr<Executable>>>( - absl::make_unique<GpuExecutable>( - ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), - std::move(module), std::move(buffer_assignment), nullptr, nullptr)); + return {absl::make_unique<GpuExecutable>( + ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), + std::move(module), std::move(buffer_assignment), nullptr, nullptr)}; } StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompiler::Compile( diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc index dfa3af8c39f..c1d47fabbcd 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc @@ -64,8 +64,7 @@ void MlirIrGenTestBase::CompileAndVerifyIr(const string& hlo_text, MlirCompiler* MlirIrGenTestBase::GetMLIRCompiler() { // TODO(b/137624192): Remove failover once no longer in place. - FailoverCompiler* failover = - static_cast<FailoverCompiler*>(backend().compiler()); + auto* failover = static_cast<FailoverCompiler*>(backend().compiler()); return static_cast<MlirCompiler*>(failover->GetPrimary()); } diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 7c4605125cf..b93c4358e6c 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -652,9 +652,7 @@ Status ValidateDotDimensionNumbers( const int64 rhs_contracting_dimension = dimension_numbers.rhs_contracting_dimensions(i); if (lhs.dimensions(lhs_contracting_dimension) != - rhs.dimensions(rhs_contracting_dimension) || - lhs.is_dynamic_dimension(lhs_contracting_dimension) != - rhs.is_dynamic_dimension(rhs_contracting_dimension)) { + rhs.dimensions(rhs_contracting_dimension)) { return fail("Contracting dimension sizes do not match."); } } @@ -668,10 +666,7 @@ Status ValidateDotDimensionNumbers( // Check that batch dimension numbers and sizes match. for (int64 i = 0; i < dimension_numbers.lhs_batch_dimensions_size(); ++i) { if (lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) != - rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i)) || - lhs.is_dynamic_dimension(dimension_numbers.lhs_batch_dimensions(i)) != - rhs.is_dynamic_dimension( - dimension_numbers.rhs_batch_dimensions(i))) { + rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i))) { return fail("Batch dimension sizes must match for lhs/rhs."); } } @@ -726,13 +721,10 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, for (int64 i = 0; i < lhs.rank(); ++i) { if (lhs.dimensions(i) == rhs.dimensions(i)) { output_dimensions[i] = lhs.dimensions(i); - output_dimensions_is_dynamic[i] = lhs.is_dynamic_dimension(i); } else if (lhs.dimensions(i) == 1) { output_dimensions[i] = rhs.dimensions(i); - output_dimensions_is_dynamic[i] = rhs.is_dynamic_dimension(i); } else if (rhs.dimensions(i) == 1) { output_dimensions[i] = lhs.dimensions(i); - output_dimensions_is_dynamic[i] = lhs.is_dynamic_dimension(i); } else { return InvalidArgument( "Binary op %s with incompatible shapes: %s and %s.", @@ -740,6 +732,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, ShapeUtil::HumanString(rhs)); } } + + // Merge dynamic dimensions from two shapes.
+ for (int64 i = 0; i < rhs.rank(); ++i) { + if (rhs.is_dynamic_dimension(i) || lhs.is_dynamic_dimension(i)) { + output_dimensions_is_dynamic[i] = true; + } + } + return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs), output_dimensions, output_dimensions_is_dynamic); } @@ -888,11 +888,18 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, if (ShapeUtil::CompatibleIgnoringFpPrecision(lhs, rhs)) { // If the shapes are the same other than layout, the output shape is the // same (elementwise op). - return ShapeUtil::ChangeElementType( + Shape result = ShapeUtil::ChangeElementType( lhs, ShapeUtil::HigherPrecisionElementType(lhs, rhs)); - } - if (lhs.rank() == rhs.rank()) { + for (int64 i = 0; i < rhs.rank(); ++i) { + if (rhs.is_dynamic_dimension(i)) { + result.set_dynamic_dimension(i, true); + } + } + + return result; + + } else if (lhs.rank() == rhs.rank()) { return InferDegenerateDimensionBroadcastShape(operation, lhs, rhs); } else { // Ranks do not match, so perform InDim broadcasting using @@ -2201,14 +2208,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, // TODO(b/119580730): Remove this restriction when very large dimension size // is needed. - if (shape.dimensions(dimension) > std::numeric_limits::max()) { + if (shape.dimensions(dimension) > std::numeric_limits::max()) { return InvalidArgument( "GetDimensionSize's input shape is %s, the %dth dimension exceeds the " - "UINT_MAX limit.", + "INT_MAX limit.", ShapeUtil::HumanString(shape), dimension); } - return ShapeUtil::MakeShape(U32, {}); + return ShapeUtil::MakeShape(S32, {}); } /* static */ StatusOr ShapeInference::InferWindowFromDimensions( @@ -2324,7 +2331,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, sizes.push_back((limit_index - start_index + stride - 1) / stride); } - return ShapeUtil::MakeShape(arg.element_type(), sizes); + std::vector is_dynamic(arg.rank()); + for (int64 i = 0; i < arg.dimensions_size(); ++i) { + is_dynamic[i] = arg.is_dynamic_dimension(i); + } + + return ShapeUtil::MakeShape(arg.element_type(), sizes, is_dynamic); } /* static */ StatusOr ShapeInference::InferDynamicSliceShape( @@ -3061,10 +3073,10 @@ static Status ValidateGatherDimensionNumbers( } for (int i = 0; i < gather_dim_numbers.collapsed_slice_dims_size(); i++) { - if (slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)] != 1) { + if (slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)] > 1) { return InvalidArgument( - "Gather op can only collapse slice dims with bound 1, but bound is " - "%d for index %d at position %d.", + "Gather op can only collapse slice dims with bound 1 or 0, but bound " + "is %d for index %d at position %d.", slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)], gather_dim_numbers.collapsed_slice_dims(i), i); } diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index c241a4ac2ce..7ccdb869a91 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -2368,9 +2368,10 @@ TEST_F(ScatterGatherShapeInferenceTest, /*index_vector_dim=*/4), /*slice_sizes=*/{30, 29, 28, 26, 20}); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Gather op can only collapse slice dims with bound 1, " - "but bound is 29 for index 1 at position 0.")) + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Gather op can only collapse 
slice dims with bound 1 or 0, " + "but bound is 29 for index 1 at position 0.")) << statusor.status(); } diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 5f11fbf03be..c47145d076d 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -128,6 +128,17 @@ StatusOr MakeShapeWithLayoutInternal( return equal; } +/* static */ bool ShapeUtil::EqualIgnoringElementType(const Shape& lhs, + const Shape& rhs) { + bool equal = Shape::Equal().IgnoreElementType()(lhs, rhs); + if (!equal && VLOG_IS_ON(3)) { + VLOG(3) << "ShapeUtil::EqualIgnoringElementType differ: lhs = " + << lhs.ShortDebugString() << ", rhs = " << rhs.ShortDebugString(); + } + + return equal; +} + /* static */ bool ShapeUtil::EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs) { bool equal = Shape::Equal().IgnoreFpPrecision()(lhs, rhs); @@ -507,17 +518,23 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( } /* static */ bool ShapeUtil::Compatible(const Shape& lhs, const Shape& rhs) { - return Shape::Equal().IgnoreLayout()(lhs, rhs); + return Shape::Equal().IgnoreDynamicDimension().IgnoreLayout()(lhs, rhs); } /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs, const Shape& rhs) { - return Shape::Equal().IgnoreElementType().IgnoreLayout()(lhs, rhs); + return Shape::Equal() + .IgnoreDynamicDimension() + .IgnoreElementType() + .IgnoreLayout()(lhs, rhs); } /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs, const Shape& rhs) { - return Shape::Equal().IgnoreFpPrecision().IgnoreLayout()(lhs, rhs); + return Shape::Equal() + .IgnoreDynamicDimension() + .IgnoreFpPrecision() + .IgnoreLayout()(lhs, rhs); } /* static */ int64 ShapeUtil::GetDimension(const Shape& shape, diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 44994b26ac1..dffabf75a9a 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -280,7 +280,6 @@ class ShapeUtil { if (SameElementType(a, b)) { return a.element_type(); } - CHECK(SameElementTypeIgnoringFpPrecision(a, b)); return primitive_util::BitWidth(a.element_type()) < primitive_util::BitWidth(b.element_type()) ? b.element_type() @@ -304,6 +303,9 @@ class ShapeUtil { // Returns whether the lhs and rhs shapes are identical. static bool Equal(const Shape& lhs, const Shape& rhs); + // As Equal, but does not compare the element type. + static bool EqualIgnoringElementType(const Shape& lhs, const Shape& rhs); + // As Equal, but allow one of lhs and rhs to be F16 while the other is F32. static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs); diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc index 020b062f6b1..4a59fe794c7 100644 --- a/tensorflow/compiler/xla/shape_util_test.cc +++ b/tensorflow/compiler/xla/shape_util_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include + #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" @@ -176,6 +177,27 @@ TEST(ShapeUtilTest, UnequalIgnoringFpPrecision) { ShapeUtil::MakeShapeWithLayout(PRED, {4, 3}, {0, 1}))); } +TEST(ShapeUtilTest, EqualIgnoringElementType) { + EXPECT_TRUE(ShapeUtil::EqualIgnoringElementType( + ShapeUtil::MakeShapeWithLayout(F32, {4, 3}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(F16, {4, 3}, {0, 1}))); + EXPECT_TRUE(ShapeUtil::EqualIgnoringElementType( + ShapeUtil::MakeShapeWithLayout(S32, {4, 3}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(F16, {4, 3}, {0, 1}))); + EXPECT_TRUE(ShapeUtil::EqualIgnoringElementType( + ShapeUtil::MakeShapeWithLayout(F32, {4, 3}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(PRED, {4, 3}, {0, 1}))); +} + +TEST(ShapeUtilTest, UnequalIgnoringElementType) { + EXPECT_FALSE(ShapeUtil::EqualIgnoringElementType( + ShapeUtil::MakeShapeWithLayout(F32, {4, 3}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(F16, {3, 4}, {0, 1}))); + EXPECT_FALSE(ShapeUtil::EqualIgnoringElementType( + ShapeUtil::MakeShapeWithLayout(F32, {3, 4}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(F16, {3, 4}, {1, 0}))); +} + TEST(ShapeUtilTest, EqualDynamicShapes) { EXPECT_TRUE( ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {4, 3}, {true, false}), @@ -195,7 +217,7 @@ TEST(ShapeUtilTest, CompatibleDynamicShapes) { EXPECT_TRUE(ShapeUtil::Compatible(shape_a, shape_a)); EXPECT_TRUE(ShapeUtil::Compatible(shape_a, shape_b)); - EXPECT_FALSE(ShapeUtil::Compatible(shape_a, shape_c)); + EXPECT_TRUE(ShapeUtil::Compatible(shape_a, shape_c)); } TEST(ShapeUtilTest, CompatibleTuples) { diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index ae0d70610be..ee823ce6364 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -745,7 +745,9 @@ xla_test( "no_oss", ], deps = [ + ":client_library_test_base", ":exhaustive_op_test_utils", + "//tensorflow/compiler/xla:util", ], ) @@ -765,7 +767,9 @@ xla_test( "no_oss", ], deps = [ + ":client_library_test_base", ":exhaustive_op_test_utils", + "//tensorflow/compiler/xla:util", ], ) @@ -785,7 +789,9 @@ xla_test( "no_oss", ], deps = [ + ":client_library_test_base", ":exhaustive_op_test_utils", + "//tensorflow/compiler/xla:util", ], ) @@ -1281,16 +1287,17 @@ xla_test( srcs = ["slice_test.cc"], shard_count = 40, deps = [ + ":client_library_test_base", + ":literal_test_util", ":test_macros_header", + ":xla_internal_test_main", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", - "//tensorflow/compiler/xla/tests:client_library_test_base", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "//tensorflow/core/platform:types", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index a5e27cd67a7..916bbed252d 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -1550,8 +1550,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) { XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) { 
XlaBuilder b(TestName()); - std::vector<float> values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f}; - std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; + std::vector<float> values0 = {1.0f, -10.0f, -2.0f, 2.0f, + 3.2f, 4.0f, 0.5f, 5.7f}; + std::vector<float> values1 = {0.0f, 10.0f, -4.0f, 1.0f, + 2.0f, 0.5f, -1.0f, -0.5f}; Literal literal0 = LiteralUtil::CreateR1<float>(values0); std::unique_ptr<GlobalData> data0 = diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc index ef800b8ef62..3e1b9508346 100644 --- a/tensorflow/compiler/xla/tests/compute_constant_test.cc +++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc @@ -160,7 +160,7 @@ TEST_F(ComputeConstantTest, GetDimensionSize) { auto get_dimension_size = GetDimensionSize(add, 0); EXPECT_TRUE(IsConstant(get_dimension_size, &b)); - TF_ASSERT_OK_AND_ASSIGN(auto value, ComputeConstantScalar<uint32>( + TF_ASSERT_OK_AND_ASSIGN(auto value, ComputeConstantScalar<int32>( client, get_dimension_size, &b)); EXPECT_EQ(value, 1); } @@ -178,7 +178,7 @@ TEST_F(ComputeConstantTest, MultipleGetDimensionSize) { EXPECT_TRUE(IsConstant(add_2, &b)); TF_ASSERT_OK_AND_ASSIGN(auto value, - ComputeConstantScalar<uint32>(client, add_2, &b)); + ComputeConstantScalar<int32>(client, add_2, &b)); EXPECT_EQ(value, 2); } } diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc index 80566be9085..6d8ddc199e2 100644 --- a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc +++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc @@ -39,7 +39,8 @@ static std::vector GetConv2DTestCases() { std::vector<std::vector<int64>> config_options = { {128, 6, 3, 64}, {256, 5, 3, 256}, {256, 5, 2, 144}, {144, 5, 3, 64}, {144, 5, 2, 256}, {8, 48, 17, 8}, {128, 20, 6, 64}, {64, 14, 12, 172}, - {16, 9, 4, 16}, {128, 1, 2, 144}, {256, 1, 2, 64}}; + {16, 9, 4, 16}, {128, 1, 2, 144}, {256, 1, 2, 64}, {256, 1, 2, 2}, + {144, 5, 3, 3}, {8, 48, 17, 1}, {16, 9, 5, 4}}; for (auto option : config_options) { int64 feature = option[0]; diff --git a/tensorflow/compiler/xla/tests/exhaustive_binary_test.cc b/tensorflow/compiler/xla/tests/exhaustive_binary_test.cc index c0f8a0dc626..64372788be4 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_binary_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_binary_test.cc @@ -88,30 +88,41 @@ inline std::function AddEmptyBroadcastDimension( }; } -#define XLA_TEST_16BIT(test_name, ...) \ - XLA_TEST_P(ExhaustiveF16BinaryTest, test_name) \ - __VA_ARGS__ \ +#if defined(BINARY_TEST_TARGET_F16) && defined(BINARY_TEST_TARGET_BF16) +#error "Can't define both BINARY_TEST_TARGET_F16 and BINARY_TEST_TARGET_BF16" +#endif + +#if defined(BINARY_TEST_TARGET_F16) && \ + !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) +#define BINARY_TEST_16BIT(test_name, ...) \ + XLA_TEST_P(ExhaustiveF16BinaryTest, test_name) \ + __VA_ARGS__ +#elif defined(BINARY_TEST_TARGET_BF16) && defined(XLA_BACKEND_SUPPORTS_BFLOAT16) +#define BINARY_TEST_16BIT(test_name, ...) \ XLA_TEST_P(ExhaustiveBF16BinaryTest, test_name) \ __VA_ARGS__ +#else +#define BINARY_TEST_16BIT(test_name, ...) +#endif -XLA_TEST_16BIT(Add, { +BINARY_TEST_16BIT(Add, { auto host_add = [](float x, float y) { return x + y; }; Run(AddEmptyBroadcastDimension(Add), host_add); }) -XLA_TEST_16BIT(Sub, { +BINARY_TEST_16BIT(Sub, { auto host_sub = [](float x, float y) { return x - y; }; Run(AddEmptyBroadcastDimension(Sub), host_sub); }) // TODO(bixia): Mul fails with bfloat16 on CPU.
-XLA_TEST_16BIT(DISABLED_ON_CPU(Mul), { +BINARY_TEST_16BIT(DISABLED_ON_CPU(Mul), { auto host_mul = [](float x, float y) { return x * y; }; Run(AddEmptyBroadcastDimension(Mul), host_mul); }) // TODO(bixia): Div fails with bfloat16 on CPU. -XLA_TEST_16BIT(DISABLED_ON_CPU(Div), { +BINARY_TEST_16BIT(DISABLED_ON_CPU(Div), { auto host_div = [](float x, float y) { return x / y; }; Run(AddEmptyBroadcastDimension(Div), host_div); }) @@ -146,19 +157,21 @@ T ReferenceMin(T x, T y) { return std::min(x, y); } -XLA_TEST_16BIT(Max, - { Run(AddEmptyBroadcastDimension(Max), ReferenceMax); }) +BINARY_TEST_16BIT(Max, { + Run(AddEmptyBroadcastDimension(Max), ReferenceMax); +}) -XLA_TEST_16BIT(Min, - { Run(AddEmptyBroadcastDimension(Min), ReferenceMin); }) +BINARY_TEST_16BIT(Min, { + Run(AddEmptyBroadcastDimension(Min), ReferenceMin); +}) // TODO(bixia): Pow fails with bfloat16 on CPU. -XLA_TEST_16BIT(DISABLED_ON_CPU(Pow), - { Run(AddEmptyBroadcastDimension(Pow), std::powf); }) +BINARY_TEST_16BIT(DISABLED_ON_CPU(Pow), + { Run(AddEmptyBroadcastDimension(Pow), std::powf); }) // TODO(bixia): Atan2 fails with bfloat16 on CPU. -XLA_TEST_16BIT(DISABLED_ON_CPU(Atan2), - { Run(AddEmptyBroadcastDimension(Atan2), std::atan2f); }) +BINARY_TEST_16BIT(DISABLED_ON_CPU(Atan2), + { Run(AddEmptyBroadcastDimension(Atan2), std::atan2f); }) #if defined(BINARY_TEST_TARGET_F16) #if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) @@ -224,35 +237,43 @@ class Exhaustive32BitOrMoreBinaryTest using ExhaustiveF32BinaryTest = Exhaustive32BitOrMoreBinaryTest; using ExhaustiveF64BinaryTest = Exhaustive32BitOrMoreBinaryTest; -XLA_TEST_P(ExhaustiveF32BinaryTest, Add) { +#if defined(BINARY_TEST_TARGET_F32) +#define BINARY_TEST_FLOAT_32(test_name, ...) \ + XLA_TEST_P(ExhaustiveF32BinaryTest, test_name) \ + __VA_ARGS__ +#else +#define BINARY_TEST_FLOAT_32(test_name, ...) +#endif + +BINARY_TEST_FLOAT_32(Add, { auto host_add = [](float x, float y) { return x + y; }; Run(AddEmptyBroadcastDimension(Add), host_add); -} +}) -XLA_TEST_P(ExhaustiveF32BinaryTest, Sub) { +BINARY_TEST_FLOAT_32(Sub, { auto host_sub = [](float x, float y) { return x - y; }; Run(AddEmptyBroadcastDimension(Sub), host_sub); -} +}) // TODO(bixia): Need to investigate the failure on CPU and file bugs. -XLA_TEST_P(ExhaustiveF32BinaryTest, DISABLED_ON_CPU(Mul)) { +BINARY_TEST_FLOAT_32(DISABLED_ON_CPU(Mul), { auto host_mul = [](float x, float y) { return x * y; }; Run(AddEmptyBroadcastDimension(Mul), host_mul); -} +}) // TODO(bixia): Need to investigate the failure on CPU and file bugs. -XLA_TEST_P(ExhaustiveF32BinaryTest, DISABLED_ON_CPU(Div)) { +BINARY_TEST_FLOAT_32(DISABLED_ON_CPU(Div), { auto host_div = [](float x, float y) { return x / y; }; Run(AddEmptyBroadcastDimension(Div), host_div); -} +}) -XLA_TEST_P(ExhaustiveF32BinaryTest, Max) { +BINARY_TEST_FLOAT_32(Max, { Run(AddEmptyBroadcastDimension(Max), ReferenceMax); -} +}) -XLA_TEST_P(ExhaustiveF32BinaryTest, Min) { +BINARY_TEST_FLOAT_32(Min, { Run(AddEmptyBroadcastDimension(Min), ReferenceMin); -} +}) // It is more convenient to implement Abs(complex) as a binary op than a unary // op, as the operations we currently support all have the same data type for @@ -261,16 +282,14 @@ XLA_TEST_P(ExhaustiveF32BinaryTest, Min) { // implement Abs(complex) as unary conveniently. // // TODO(bixia): Need to investigate the failure on CPU and file bugs. 
-XLA_TEST_P(ExhaustiveF32BinaryTest, DISABLED_ON_CPU(AbsComplex)) { +BINARY_TEST_FLOAT_32(DISABLED_ON_CPU(AbsComplex), { auto host_abs_complex = [](float x, float y) { return std::abs(std::complex(x, y)); }; auto device_abs_complex = [](XlaOp x, XlaOp y) { return Abs(Complex(x, y)); }; Run(device_abs_complex, host_abs_complex); -} - -#if defined(BINARY_TEST_TARGET_F32) +}) INSTANTIATE_TEST_SUITE_P( SpecialValues, ExhaustiveF32BinaryTest, @@ -307,51 +326,55 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn( GetFpValuesForMagnitudeExtremeNormals(40000, 2000)))); +#if defined(BINARY_TEST_TARGET_F64) && \ + !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) +#define BINARY_TEST_FLOAT_64(test_name, ...) \ + XLA_TEST_P(ExhaustiveF64BinaryTest, test_name) \ + __VA_ARGS__ +#else +#define BINARY_TEST_FLOAT_64(test_name, ...) #endif -XLA_TEST_P(ExhaustiveF64BinaryTest, Add) { +BINARY_TEST_FLOAT_64(Add, { auto host_add = [](double x, double y) { return x + y; }; Run(AddEmptyBroadcastDimension(Add), host_add); -} +}) -XLA_TEST_P(ExhaustiveF64BinaryTest, Sub) { +BINARY_TEST_FLOAT_64(Sub, { auto host_sub = [](double x, double y) { return x - y; }; Run(AddEmptyBroadcastDimension(Sub), host_sub); -} +}) // TODO(bixia): Need to investigate the failure on CPU and file bugs. -XLA_TEST_P(ExhaustiveF64BinaryTest, DISABLED_ON_CPU(Mul)) { +BINARY_TEST_FLOAT_64(DISABLED_ON_CPU(Mul), { auto host_mul = [](double x, double y) { return x * y; }; Run(AddEmptyBroadcastDimension(Mul), host_mul); -} +}) // TODO(bixia): Need to investigate the failure on CPU and file bugs. -XLA_TEST_P(ExhaustiveF64BinaryTest, DISABLED_ON_CPU(Div)) { +BINARY_TEST_FLOAT_64(DISABLED_ON_CPU(Div), { auto host_div = [](double x, double y) { return x / y; }; Run(AddEmptyBroadcastDimension(Div), host_div); -} +}) -XLA_TEST_P(ExhaustiveF64BinaryTest, Max) { +BINARY_TEST_FLOAT_64(Max, { Run(AddEmptyBroadcastDimension(Max), ReferenceMax); -} +}) -XLA_TEST_P(ExhaustiveF64BinaryTest, Min) { +BINARY_TEST_FLOAT_64(Min, { Run(AddEmptyBroadcastDimension(Min), ReferenceMin); -} +}) // TODO(bixia): Need to investigate the failure on CPU and file bugs. -XLA_TEST_P(ExhaustiveF64BinaryTest, DISABLED_ON_CPU(AbsComplex)) { +BINARY_TEST_FLOAT_64(DISABLED_ON_CPU(AbsComplex), { auto host_abs_complex = [](double x, double y) { return std::abs(std::complex(x, y)); }; auto device_abs_complex = [](XlaOp x, XlaOp y) { return Abs(Complex(x, y)); }; Run(device_abs_complex, host_abs_complex); -} +}) -#if defined(BINARY_TEST_TARGET_F64) - -#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) INSTANTIATE_TEST_SUITE_P( SpecialValues, ExhaustiveF64BinaryTest, ::testing::Combine( @@ -385,8 +408,6 @@ INSTANTIATE_TEST_SUITE_P( GetFpValuesForMagnitudeExtremeNormals(40000, 2000)), ::testing::ValuesIn( GetFpValuesForMagnitudeExtremeNormals(40000, 2000)))); -#endif -#endif } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc index 3a14bb2d4cc..0a8fd82dd0c 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h" +#include "tensorflow/compiler/xla/util.h" #ifdef __FAST_MATH__ #error "Can't be compiled with fast math on" @@ -211,15 +213,54 @@ typedef Exhaustive32BitOrLessUnaryTest ExhaustiveF32UnaryTest; typedef Exhaustive32BitOrLessUnaryTest ExhaustiveF16UnaryTest; typedef Exhaustive32BitOrLessUnaryTest ExhaustiveBF16UnaryTest; -#define XLA_TEST_FLOAT_32_BITS_OR_LESS(test_name, ...) \ - XLA_TEST_P(ExhaustiveF32UnaryTest, test_name) \ - __VA_ARGS__ \ - XLA_TEST_P(ExhaustiveF16UnaryTest, test_name) \ - __VA_ARGS__ \ - XLA_TEST_P(ExhaustiveBF16UnaryTest, test_name) \ - __VA_ARGS__ +#if defined(UNARY_TEST_TARGET_F32_OR_SMALLER) +#define NEED_UNARY_F32 true +#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) +#define NEED_UNARY_F16 true +#else +#define NEED_UNARY_F16 false +#endif +#if defined(XLA_BACKEND_SUPPORTS_BFLOAT16) +#define NEED_UNARY_BF16 true +#else +#define NEED_UNARY_BF16 false +#endif +#else +#define NEED_UNARY_F32 false +#define NEED_UNARY_F16 false +#define NEED_UNARY_BF16 false +#endif -XLA_TEST_FLOAT_32_BITS_OR_LESS(Log, { +#if NEED_UNARY_F32 +#define UNARY_TEST_F32(test_name, ...) \ + XLA_TEST_P(ExhaustiveF32UnaryTest, test_name) \ + __VA_ARGS__ +#else +#define UNARY_TEST_F32(test_name, ...) +#endif + +#if NEED_UNARY_F16 +#define UNARY_TEST_F16(test_name, ...) \ + XLA_TEST_P(ExhaustiveF16UnaryTest, test_name) \ + __VA_ARGS__ +#else +#define UNARY_TEST_F16(test_name, ...) +#endif + +#if NEED_UNARY_BF16 +#define UNARY_TEST_BF16(test_name, ...) \ + XLA_TEST_P(ExhaustiveBF16UnaryTest, test_name) \ + __VA_ARGS__ +#else +#define UNARY_TEST_BF16(test_name, ...) +#endif + +#define UNARY_TEST_FLOAT_32_BITS_OR_LESS(test_name, ...) \ + UNARY_TEST_F32(test_name, __VA_ARGS__) \ + UNARY_TEST_F16(test_name, __VA_ARGS__) \ + UNARY_TEST_BF16(test_name, __VA_ARGS__) + +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Log, { ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) { error_spec_gen = +[](NativeT x) { return ErrorSpec{0.001, 0.001}; }; @@ -227,7 +268,7 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(Log, { Run(Log, std::log, error_spec_gen); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Log1p, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Log1p, { ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) { error_spec_gen = +[](NativeT x) { return ErrorSpec{0.001, 0.001}; }; @@ -235,7 +276,7 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(Log1p, { Run(Log1p, std::log1p, error_spec_gen); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Exp, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Exp, { // When x < -105, the true value of exp(x) is smaller than the smallest F32, // so exp(x) should return exactly 0. 
We want our implementation of exp to // return exactly 0 as well, as not doing so implies either that our @@ -266,7 +307,7 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(Exp, { } }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Expm1, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Expm1, { ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (ty_ == F32) { error_spec_gen = +[](NativeT x) { return ErrorSpec{0, 0.00015}; }; @@ -292,7 +333,7 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(Expm1, { // It feels a little overkill to exhaustively test sqrt and pow(x, 0.5), but // this *did* find a bug, namely that some backends were assuming sqrt(x) == // pow(x, 0.5), but this is not true for x == -inf. -XLA_TEST_FLOAT_32_BITS_OR_LESS(PowOneHalf, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(PowOneHalf, { EvaluateOp fn = +[](float x) { return std::pow(x, 0.5f); }; // TODO(b/123837116): Enable the test for all values after fixing the bug. if (platform_ != "Host" && platform_ != "CUDA") { @@ -306,12 +347,12 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(PowOneHalf, { Run([](XlaOp x) { return Pow(x, ScalarLike(x, 0.5)); }, fn); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Rsqrt, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Rsqrt, { Run( Rsqrt, +[](float x) { return 1 / std::sqrt(x); }); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Sqrt, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Sqrt, { ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ == "Host" || platform_ == "CUDA") { error_spec_gen = +[](NativeT x) { @@ -349,11 +390,11 @@ XLA_TEST_P(ExhaustiveF32UnaryTest, Asinh) { XLA_TEST_P(ExhaustiveF16UnaryTest, Asinh) { Run(Asinh, std::asinh); } XLA_TEST_P(ExhaustiveBF16UnaryTest, Asinh) { Run(Asinh, std::asinh); } -XLA_TEST_FLOAT_32_BITS_OR_LESS(Atanh, { Run(Atanh, std::atanh); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Acos, { Run(Acos, std::acos); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Asin, { Run(Asin, std::asin); }) +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Atanh, { Run(Atanh, std::atanh); }) +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Acos, { Run(Acos, std::acos); }) +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Asin, { Run(Asin, std::asin); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Cosh, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Cosh, { // Our cosh implementation incorrectly overflows to inf for +/-89.4159851. // The correct answer of 3.40281961e+38 (0x7f7fffec) is very close to // max-float, so we deem this acceptable. @@ -374,7 +415,7 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(Cosh, { Run(Cosh, host_cosh); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Sinh, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Sinh, { // Our sinh implementation incorrectly overflows to +/-inf for +/-89.4159851. // The correct answer of 3.40281961e+38 (0x7f7fffec) is very close to // max-float, so we deem this acceptable. 
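// Illustrative note, not part of this change: the NEED_UNARY_* / UNARY_TEST_*
// macros introduced above gate each exhaustive test on the build target and on
// backend support, so every test either expands to an ordinary parameterized
// test or to nothing at all. A minimal sketch of the expansion, using only
// names defined earlier in this file:
//
//   UNARY_TEST_FLOAT_32_BITS_OR_LESS(Round, { Run(Round, std::round); })
//   // With NEED_UNARY_F32 true, the F32 case becomes:
//   //   XLA_TEST_P(ExhaustiveF32UnaryTest, Round) { Run(Round, std::round); }
//   // With NEED_UNARY_F32 false, it expands to nothing, so the test is
//   //   neither compiled nor registered for that type.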
@@ -395,7 +436,7 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(Sinh, { Run(Sinh, host_sinh); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Tanh, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Tanh, { ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ == "CUDA") { error_spec_gen = +[](NativeT x) { @@ -429,62 +470,68 @@ void Exhaustive32BitOrLessUnaryTest::SetParamsForSinCosTan() { } } -XLA_TEST_P(ExhaustiveF32UnaryTest, Cos) { +UNARY_TEST_F32(Cos, { SetParamsForSinCosTan(); Run( Cos, std::cos, +[](NativeT) { return ErrorSpec{0.001, 0.001}; }); -} -XLA_TEST_P(ExhaustiveF16UnaryTest, Cos) { - SetParamsForSinCosTan(); - Run(Cos, std::cos); -} -XLA_TEST_P(ExhaustiveBF16UnaryTest, Cos) { - SetParamsForSinCosTan(); - Run(Cos, std::cos); -} +}) -XLA_TEST_P(ExhaustiveF32UnaryTest, Sin) { +UNARY_TEST_F16(Cos, { + SetParamsForSinCosTan(); + Run(Cos, std::cos); +}) + +UNARY_TEST_BF16(Cos, { + SetParamsForSinCosTan(); + Run(Cos, std::cos); +}) + +UNARY_TEST_F32(Sin, { SetParamsForSinCosTan(); Run( Sin, std::sin, +[](NativeT) { return ErrorSpec{0.001, 0.001}; }); -} -XLA_TEST_P(ExhaustiveF16UnaryTest, Sin) { - SetParamsForSinCosTan(); - Run(Sin, std::sin); -} -XLA_TEST_P(ExhaustiveBF16UnaryTest, Sin) { - SetParamsForSinCosTan(); - Run(Sin, std::sin); -} +}) -XLA_TEST_P(ExhaustiveF32UnaryTest, Tan) { +UNARY_TEST_F16(Sin, { + SetParamsForSinCosTan(); + Run(Sin, std::sin); +}) + +UNARY_TEST_BF16(Sin, { + SetParamsForSinCosTan(); + Run(Sin, std::sin); +}) + +UNARY_TEST_F32(Tan, { SetParamsForSinCosTan(); Run( Tan, std::tan, +[](NativeT) { return ErrorSpec{0.001, 0.001}; }); -} -XLA_TEST_P(ExhaustiveF16UnaryTest, Tan) { +}) + +UNARY_TEST_F16(Tan, { SetParamsForSinCosTan(); Run(Tan, std::tan); -} -XLA_TEST_P(ExhaustiveBF16UnaryTest, Tan) { +}) + +UNARY_TEST_BF16(Tan, { SetParamsForSinCosTan(); Run(Tan, std::tan); -} +}) // TODO(jlebar): Enable these. -// XLA_TEST_FLOAT_32_BITS_OR_LESS(Atan) { Run(Atan, std::atan); } -// XLA_TEST_FLOAT_32_BITS_OR_LESS(Atan2) { Run(Atan2, std::atan2); } +// UNARY_TEST_FLOAT_32_BITS_OR_LESS(Atan) { Run(Atan, std::atan); } +// UNARY_TEST_FLOAT_32_BITS_OR_LESS(Atan2) { Run(Atan2, std::atan2); } -XLA_TEST_FLOAT_32_BITS_OR_LESS(Erf, { Run(Erf, std::erf); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Erfc, { Run(Erfc, std::erfc); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(ErfInv, { Run(ErfInv, HostErfInv); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Digamma, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Erf, { Run(Erf, std::erf); }) +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Erfc, { Run(Erfc, std::erfc); }) +UNARY_TEST_FLOAT_32_BITS_OR_LESS(ErfInv, { Run(ErfInv, HostErfInv); }) +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Digamma, { ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ != "Host" && platform_ != "CUDA") { // TODO(b/123956399): This is a fairly high error, significantly higher than @@ -514,7 +561,7 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(Digamma, { } }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Lgamma, { +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Lgamma, { // Our implementation gets within 0.0001 rel error except for ~20 denormal // inputs on GPU. Anyway 0.001 rel error should be good enough for lgamma. 
ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); @@ -545,9 +592,7 @@ XLA_TEST_FLOAT_32_BITS_OR_LESS(Lgamma, { Run(Lgamma, host_lgamma, error_spec_gen); }) -XLA_TEST_FLOAT_32_BITS_OR_LESS(Round, { Run(Round, std::round); }) - -#if defined(UNARY_TEST_TARGET_F32_OR_SMALLER) +UNARY_TEST_FLOAT_32_BITS_OR_LESS(Round, { Run(Round, std::round); }) INSTANTIATE_TEST_SUITE_P(F32, ExhaustiveF32UnaryTest, ::testing::ValuesIn(CreateExhaustiveF32Ranges())); @@ -562,8 +607,6 @@ INSTANTIATE_TEST_SUITE_P(BF16, ExhaustiveBF16UnaryTest, ::testing::Values(std::make_pair(0, 1 << 16))); #endif -#endif - // Exhaustive test for unary operations for double. // // Test parameter is a tuple containing @@ -594,42 +637,51 @@ class ExhaustiveF64UnaryTest : public ExhaustiveUnaryTest, } }; -XLA_TEST_P(ExhaustiveF64UnaryTest, Log) { Run(Log, std::log); } +#if defined(UNARY_TEST_TARGET_F64) && \ + !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) +#define UNARY_TEST_FLOAT_64(test_name, ...) \ + XLA_TEST_P(ExhaustiveF64UnaryTest, test_name) \ + __VA_ARGS__ +#else +#define UNARY_TEST_FLOAT_64(test_name, ...) +#endif -XLA_TEST_P(ExhaustiveF64UnaryTest, Log1p) { Run(Log1p, std::log1p); } +UNARY_TEST_FLOAT_64(Log, { Run(Log, std::log); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Exp) { Run(Exp, std::exp); } +UNARY_TEST_FLOAT_64(Log1p, { Run(Log1p, std::log1p); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Expm1) { Run(Expm1, std::expm1); } +UNARY_TEST_FLOAT_64(Exp, { Run(Exp, std::exp); }) + +UNARY_TEST_FLOAT_64(Expm1, { Run(Expm1, std::expm1); }) // TODO(b/138385863): Turn on the test for GPU after fixing the bug. -XLA_TEST_P(ExhaustiveF64UnaryTest, DISABLED_ON_GPU(PowOneHalf)) { +UNARY_TEST_FLOAT_64(DISABLED_ON_GPU(PowOneHalf), { Run([](XlaOp x) { return Pow(x, ScalarLike(x, 0.5)); }, +[](double x) { return std::pow(x, 0.5); }); -} +}) -XLA_TEST_P(ExhaustiveF64UnaryTest, Rsqrt) { +UNARY_TEST_FLOAT_64(Rsqrt, { Run( Rsqrt, +[](double x) { return 1 / std::sqrt(x); }); -} +}) -XLA_TEST_P(ExhaustiveF64UnaryTest, Sqrt) { Run(Sqrt, std::sqrt); } +UNARY_TEST_FLOAT_64(Sqrt, { Run(Sqrt, std::sqrt); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Acosh) { Run(Acosh, std::acosh); } +UNARY_TEST_FLOAT_64(Acosh, { Run(Acosh, std::acosh); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Asinh) { Run(Asinh, std::asinh); } +UNARY_TEST_FLOAT_64(Asinh, { Run(Asinh, std::asinh); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Atanh) { Run(Atanh, std::atanh); } +UNARY_TEST_FLOAT_64(Atanh, { Run(Atanh, std::atanh); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Acos) { Run(Acos, std::acos); } +UNARY_TEST_FLOAT_64(Acos, { Run(Acos, std::acos); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Asin) { Run(Asin, std::asin); } +UNARY_TEST_FLOAT_64(Asin, { Run(Asin, std::asin); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Cosh) { Run(Cosh, std::cosh); } +UNARY_TEST_FLOAT_64(Cosh, { Run(Cosh, std::cosh); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Sinh) { Run(Sinh, std::sinh); } +UNARY_TEST_FLOAT_64(Sinh, { Run(Sinh, std::sinh); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Tanh) { +UNARY_TEST_FLOAT_64(Tanh, { ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ == "CUDA") { error_spec_gen = +[](NativeT x) { @@ -639,26 +691,24 @@ XLA_TEST_P(ExhaustiveF64UnaryTest, Tanh) { }; } Run(Tanh, std::tanh, error_spec_gen); -} +}) -XLA_TEST_P(ExhaustiveF64UnaryTest, Cos) { Run(Cos, std::cos); } +UNARY_TEST_FLOAT_64(Cos, { Run(Cos, std::cos); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Sin) { Run(Sin, std::sin); } +UNARY_TEST_FLOAT_64(Sin, { Run(Sin, std::sin); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Tan) { Run(Tan, 
std::tan); } +UNARY_TEST_FLOAT_64(Tan, { Run(Tan, std::tan); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Round) { Run(Round, std::round); } +UNARY_TEST_FLOAT_64(Round, { Run(Round, std::round); }) -XLA_TEST_P(ExhaustiveF64UnaryTest, Erf) { +UNARY_TEST_FLOAT_64(Erf, { Run(Erf, std::erf, [](NativeT x) { return ErrorSpec{1e-20, 1e-20}; }); -} +}) -XLA_TEST_P(ExhaustiveF64UnaryTest, Erfc) { +UNARY_TEST_FLOAT_64(Erfc, { Run(Erfc, std::erfc, [](NativeT x) { return ErrorSpec{1e-20, 1e-20}; }); -} +}) -#if defined(UNARY_TEST_TARGET_F64) -#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) INSTANTIATE_TEST_SUITE_P( SpecialValues, ExhaustiveF64UnaryTest, ::testing::ValuesIn(CreateFpValuesForBoundaryTest())); @@ -672,8 +722,6 @@ INSTANTIATE_TEST_SUITE_P( LargeAndSmallMagnituedNormalValues, ExhaustiveF64UnaryTest, ::testing::ValuesIn(GetFpValuesForMagnitudeExtremeNormals( 4000000000ull, 16000000))); -#endif -#endif // T is the Primitive Type of the complex number // Test parameter is a tuple containing @@ -741,30 +789,38 @@ class ExhaustiveComplexUnaryTestBase typedef ExhaustiveComplexUnaryTestBase ExhaustiveC64UnaryTest; typedef ExhaustiveComplexUnaryTestBase ExhaustiveC128UnaryTest; -// TODO(b/138578594): Enable the test for the CPU backend after fixing the bug. -XLA_TEST_P(ExhaustiveC64UnaryTest, DISABLED_ON_CPU(Log)) { - Run(Log, [](complex64 x) { return std::log(x); }); -} +#if defined(UNARY_TEST_TARGET_COMPLEX) +#define UNARY_TEST_COMPLEX_64(test_name, ...) \ + XLA_TEST_P(ExhaustiveC64UnaryTest, test_name) \ + __VA_ARGS__ +#else +#define UNARY_TEST_COMPLEX_64(test_name, ...) +#endif -XLA_TEST_P(ExhaustiveC64UnaryTest, Sqrt) { +// TODO(b/138578594): Enable the test for the CPU backend after fixing the bug. +UNARY_TEST_COMPLEX_64(DISABLED_ON_CPU(Log), { + Run(Log, [](complex64 x) { return std::log(x); }); +}) + +UNARY_TEST_COMPLEX_64(Sqrt, { Run(Sqrt, [](complex64 x) { return static_cast( std::sqrt(static_cast(x))); }); -} +}) -XLA_TEST_P(ExhaustiveC64UnaryTest, Rsqrt) { +UNARY_TEST_COMPLEX_64(Rsqrt, { Run(Rsqrt, [](complex64 x) { return static_cast( complex128(1, 0) / std::sqrt(static_cast(x))); }); -} +}) // The current libc++ implementation of the complex tanh function provides // less accurate results when the denomenator of a complex tanh is small, due // to floating point precision loss. To avoid this issue for complex64 numbers, // we cast it to and from a complex128 when computing tanh. -XLA_TEST_P(ExhaustiveC64UnaryTest, Tanh) { +UNARY_TEST_COMPLEX_64(Tanh, { SetParamsForTanh(); ErrorSpecGen error_spec_gen = +[](complex64 x) { // This implementation of Tanh becomes less accurate when the denominator @@ -781,9 +837,8 @@ XLA_TEST_P(ExhaustiveC64UnaryTest, Tanh) { return static_cast(std::tanh(static_cast(x))); }, error_spec_gen); -} +}) -#if defined(UNARY_TEST_TARGET_COMPLEX) INSTANTIATE_TEST_SUITE_P( F32SpecialValues, ExhaustiveC64UnaryTest, ::testing::Combine( @@ -816,10 +871,17 @@ INSTANTIATE_TEST_SUITE_P( 4000)), ::testing::ValuesIn( GetFpValuesForMagnitudeExtremeNormals(40000, 4000)))); + +#if defined(UNARY_TEST_TARGET_COMPLEX) && \ + !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) +#define UNARY_TEST_COMPLEX_128(test_name, ...) \ + XLA_TEST_P(ExhaustiveC128UnaryTest, test_name) \ + __VA_ARGS__ +#else +#define UNARY_TEST_COMPLEX_128(test_name, ...) #endif - -XLA_TEST_P(ExhaustiveC128UnaryTest, Log) { +UNARY_TEST_COMPLEX_128(Log, { // TODO(b/138578313): Enable the test for all values after fixing the bug. 
known_incorrect_fn_ = [&](int64 v) { double f = this->ConvertValue(v); @@ -827,18 +889,18 @@ XLA_TEST_P(ExhaustiveC128UnaryTest, Log) { std::abs(f) < 1.0e-300; }; Run(Log, [](complex128 x) { return std::log(x); }); -} +}) -XLA_TEST_P(ExhaustiveC128UnaryTest, Sqrt) { +UNARY_TEST_COMPLEX_128(Sqrt, { // Similar to the Tanh bug. known_incorrect_fn_ = [&](int64 v) { double f = this->ConvertValue(v); return std::abs(f) > std::numeric_limits::max() / 2; }; Run(Sqrt, [](complex128 x) { return std::sqrt(x); }); -} +}) -XLA_TEST_P(ExhaustiveC128UnaryTest, Rsqrt) { +UNARY_TEST_COMPLEX_128(Rsqrt, { ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator(); if (platform_ == "CUDA") { // Edge case on CUDA backend where the Log of a complex number made up of @@ -856,16 +918,14 @@ XLA_TEST_P(ExhaustiveC128UnaryTest, Rsqrt) { Rsqrt, [](complex128 x) { return complex128(1, 0) / std::sqrt(x); }, error_spec_gen); -} +}) -XLA_TEST_P(ExhaustiveC128UnaryTest, Tanh) { +UNARY_TEST_COMPLEX_128(Tanh, { SetParamsForTanh(); Run( Tanh, +[](complex128 x) { return std::tanh(x); }); -} +}) -#if defined(UNARY_TEST_TARGET_COMPLEX) -#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) INSTANTIATE_TEST_SUITE_P( SpecialValues, ExhaustiveC128UnaryTest, ::testing::Combine( @@ -898,7 +958,5 @@ INSTANTIATE_TEST_SUITE_P( GetFpValuesForMagnitudeExtremeNormals(40000, 2000)), ::testing::ValuesIn( GetFpValuesForMagnitudeExtremeNormals(40000, 2000)))); -#endif -#endif } // namespace xla diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc index 3fb69419e73..c04c4ec3e9d 100644 --- a/tensorflow/compiler/xla/tests/slice_test.cc +++ b/tensorflow/compiler/xla/tests/slice_test.cc @@ -259,17 +259,31 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run(GetParam()); } XLA_TEST_P(SliceR1Test, DoIt_S64) { Run(GetParam()); } -XLA_TEST_P(SliceR1LargeTest, DoIt_F32) { Run(GetParam()); } +// TODO(b/69425338): The following tests are disable on GPU because they use +// too much GPU memory. +XLA_TEST_P(SliceR1LargeTest, DISABLED_ON_GPU(DoIt_F32)) { + Run(GetParam()); +} -XLA_TEST_P(SliceR1LargeTest, DoIt_F64) { Run(GetParam()); } +XLA_TEST_P(SliceR1LargeTest, DISABLED_ON_GPU(DoIt_F64)) { + Run(GetParam()); +} -XLA_TEST_P(SliceR1LargeTest, DoIt_U32) { Run(GetParam()); } +XLA_TEST_P(SliceR1LargeTest, DISABLED_ON_GPU(DoIt_U32)) { + Run(GetParam()); +} -XLA_TEST_P(SliceR1LargeTest, DoIt_S32) { Run(GetParam()); } +XLA_TEST_P(SliceR1LargeTest, DISABLED_ON_GPU(DoIt_S32)) { + Run(GetParam()); +} -XLA_TEST_P(SliceR1LargeTest, DoIt_U64) { Run(GetParam()); } +XLA_TEST_P(SliceR1LargeTest, DISABLED_ON_GPU(DoIt_U64)) { + Run(GetParam()); +} -XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run(GetParam()); } +XLA_TEST_P(SliceR1LargeTest, DISABLED_ON_GPU(DoIt_S64)) { + Run(GetParam()); +} XLA_TEST_P(SliceR1Test, DoIt_PRED) { Run(GetParam()); } @@ -315,8 +329,6 @@ INSTANTIATE_TEST_CASE_P( SliceR1TestDataToString ); -// TODO(b/69425338): This uses too much memory on GPU. 
-#ifndef XLA_TEST_BACKEND_GPU INSTANTIATE_TEST_CASE_P( SliceR1TestBigSlicesInstantiation, SliceR1LargeTest, @@ -330,7 +342,6 @@ INSTANTIATE_TEST_CASE_P( ), SliceR1TestDataToString ); -#endif INSTANTIATE_TEST_CASE_P( SliceStridedR1TestInstantiation, diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 8c11077d549..a3bc092ac83 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -211,7 +211,7 @@ message DebugOptions { int32 xla_force_host_platform_device_count = 102; // If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3). - bool xla_gpu_disable_ptxas_optimizations = 103; + bool xla_gpu_disable_gpuasm_optimizations = 103; // Enable fast math with eigen in the HLO evaluator. bool xla_hlo_evaluator_use_fast_path = 106; diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index 87c920efa2b..c6ebd8594e9 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -507,7 +507,6 @@ cuda_py_test( "//tensorflow/python:framework", "//tensorflow/python:framework_for_generated_wrappers", "//tensorflow/python:framework_test_lib", - "//tensorflow/python:spectral_ops_test_util", "//tensorflow/python:math_ops", "//tensorflow/python:nn_ops", "//tensorflow/python:platform_test", diff --git a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py index d6020e78667..f2be3bdb656 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py @@ -24,7 +24,6 @@ from tensorflow.contrib.distributions.python.ops import sample_stats from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import spectral_ops_test_util from tensorflow.python.platform import test rng = np.random.RandomState(0) @@ -46,17 +45,16 @@ class _AutoCorrelationTest(object): x_ph = array_ops.placeholder_with_default( input=x_, shape=x_.shape if self.use_static_shape else None) - with spectral_ops_test_util.fft_kernel_label_map(): - with self.cached_session() as sess: - # Setting normalize = True means we divide by zero. - auto_corr = sample_stats.auto_correlation( - x_ph, axis=1, center=False, normalize=False) - if self.use_static_shape: - self.assertEqual((2, 3), auto_corr.shape) - auto_corr_ = sess.run(auto_corr) - self.assertAllClose( - [[0., 0., 0.], - [1., 1., 1.]], auto_corr_) + with self.cached_session() as sess: + # Setting normalize = True means we divide by zero. + auto_corr = sample_stats.auto_correlation( + x_ph, axis=1, center=False, normalize=False) + if self.use_static_shape: + self.assertEqual((2, 3), auto_corr.shape) + auto_corr_ = sess.run(auto_corr) + self.assertAllClose( + [[0., 0., 0.], + [1., 1., 1.]], auto_corr_) def test_constant_sequence_axis_0_max_lags_none_center_true(self): x_ = np.array([[0., 0., 0.], @@ -64,17 +62,16 @@ class _AutoCorrelationTest(object): x_ph = array_ops.placeholder_with_default( input=x_, shape=x_.shape if self.use_static_shape else None) - with spectral_ops_test_util.fft_kernel_label_map(): - with self.cached_session() as sess: - # Setting normalize = True means we divide by zero. 
- auto_corr = sample_stats.auto_correlation( - x_ph, axis=1, normalize=False, center=True) - if self.use_static_shape: - self.assertEqual((2, 3), auto_corr.shape) - auto_corr_ = sess.run(auto_corr) - self.assertAllClose( - [[0., 0., 0.], - [0., 0., 0.]], auto_corr_) + with self.cached_session() as sess: + # Setting normalize = True means we divide by zero. + auto_corr = sample_stats.auto_correlation( + x_ph, axis=1, normalize=False, center=True) + if self.use_static_shape: + self.assertEqual((2, 3), auto_corr.shape) + auto_corr_ = sess.run(auto_corr) + self.assertAllClose( + [[0., 0., 0.], + [0., 0., 0.]], auto_corr_) def check_results_versus_brute_force( self, x, axis, max_lags, center, normalize): @@ -99,16 +96,15 @@ class _AutoCorrelationTest(object): x_ph = array_ops.placeholder_with_default( x, shape=x.shape if self.use_static_shape else None) - with spectral_ops_test_util.fft_kernel_label_map(): - with self.cached_session(): - auto_corr = sample_stats.auto_correlation( - x_ph, axis=axis, max_lags=max_lags, center=center, - normalize=normalize) - if self.use_static_shape: - output_shape = list(x.shape) - output_shape[axis] = max_lags + 1 - self.assertAllEqual(output_shape, auto_corr.shape) - self.assertAllClose(rxx, auto_corr.eval(), rtol=1e-5, atol=1e-5) + with self.cached_session(): + auto_corr = sample_stats.auto_correlation( + x_ph, axis=axis, max_lags=max_lags, center=center, + normalize=normalize) + if self.use_static_shape: + output_shape = list(x.shape) + output_shape[axis] = max_lags + 1 + self.assertAllEqual(output_shape, auto_corr.shape) + self.assertAllClose(rxx, auto_corr.eval(), rtol=1e-5, atol=1e-5) def test_axis_n1_center_false_max_lags_none(self): x = rng.randn(2, 3, 4).astype(self.dtype) @@ -166,20 +162,18 @@ class _AutoCorrelationTest(object): x = rng.randn(l).astype(self.dtype) x_ph = array_ops.placeholder_with_default( x, shape=(l,) if self.use_static_shape else None) - with spectral_ops_test_util.fft_kernel_label_map(): - with self.cached_session(): - rxx = sample_stats.auto_correlation( - x_ph, max_lags=l // 2, center=True, normalize=False) - if self.use_static_shape: - self.assertAllEqual((l // 2 + 1,), rxx.shape) - rxx_ = rxx.eval() - # OSS CPU FFT has some accuracy issues is not the most accurate. - # So this tolerance is a bit bad. - self.assertAllClose(1., rxx_[0], rtol=0.05) - # The maximal error in the rest of the sequence is not great. - self.assertAllClose(np.zeros(l // 2), rxx_[1:], atol=0.1) - # The mean error in the rest is ok, actually 0.008 when I tested it. - self.assertLess(np.abs(rxx_[1:]).mean(), 0.02) + with self.cached_session(): + rxx = sample_stats.auto_correlation( + x_ph, max_lags=l // 2, center=True, normalize=False) + if self.use_static_shape: + self.assertAllEqual((l // 2 + 1,), rxx.shape) + rxx_ = rxx.eval() + # OSS CPU FFT has some accuracy issues, so this tolerance is a bit bad. + self.assertAllClose(1., rxx_[0], rtol=0.05) + # The maximal error in the rest of the sequence is not great. + self.assertAllClose(np.zeros(l // 2), rxx_[1:], atol=0.1) + # The mean error in the rest is ok, actually 0.008 when I tested it. + self.assertLess(np.abs(rxx_[1:]).mean(), 0.02) def test_step_function_sequence(self): # x jumps to new random value every 10 steps. So correlation length = 10. 
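# Illustrative note, not part of this change: a rough NumPy sketch of the
# statistic these tests exercise. With center=True and normalize=False,
# sample_stats.auto_correlation(x)[m] behaves like a lag-m autocovariance
# estimate; the exact scaling is defined by the op itself, so this brute-force
# version is only an approximation for intuition.

import numpy as np

def approx_auto_correlation(x, max_lags):
  # Average lag-m products of the centered sequence.
  xc = x - x.mean()
  n = len(xc)
  return np.array([(xc[:n - m] * xc[m:]).mean() for m in range(max_lags + 1)])

# For x drawn i.i.d. from N(0, 1), lag 0 comes out near 1 and the higher lags
# near 0, which is what the long random-sequence test above asserts.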
@@ -187,43 +181,40 @@ class _AutoCorrelationTest(object): * np.ones((1, 10))).ravel().astype(self.dtype) x_ph = array_ops.placeholder_with_default( x, shape=(1000 * 10,) if self.use_static_shape else None) - with spectral_ops_test_util.fft_kernel_label_map(): - with self.cached_session(): - rxx = sample_stats.auto_correlation( - x_ph, max_lags=1000 * 10 // 2, center=True, normalize=False) - if self.use_static_shape: - self.assertAllEqual((1000 * 10 // 2 + 1,), rxx.shape) - rxx_ = rxx.eval() - rxx_ /= rxx_[0] - # Expect positive correlation for the first 10 lags, then significantly - # smaller negative. - self.assertGreater(rxx_[:10].min(), 0) - self.assertGreater(rxx_[9], 5 * rxx_[10:20].mean()) - # RXX should be decreasing for the first 10 lags. - diff = np.diff(rxx_) - self.assertLess(diff[:10].max(), 0) + with self.cached_session(): + rxx = sample_stats.auto_correlation( + x_ph, max_lags=1000 * 10 // 2, center=True, normalize=False) + if self.use_static_shape: + self.assertAllEqual((1000 * 10 // 2 + 1,), rxx.shape) + rxx_ = rxx.eval() + rxx_ /= rxx_[0] + # Expect positive correlation for the first 10 lags, then significantly + # smaller negative. + self.assertGreater(rxx_[:10].min(), 0) + self.assertGreater(rxx_[9], 5 * rxx_[10:20].mean()) + # RXX should be decreasing for the first 10 lags. + diff = np.diff(rxx_) + self.assertLess(diff[:10].max(), 0) def test_normalization(self): l = 10000 x = 3 * rng.randn(l).astype(self.dtype) x_ph = array_ops.placeholder_with_default( x, shape=(l,) if self.use_static_shape else None) - with spectral_ops_test_util.fft_kernel_label_map(): - with self.cached_session(): - rxx = sample_stats.auto_correlation( - x_ph, max_lags=l // 2, center=True, normalize=True) - if self.use_static_shape: - self.assertAllEqual((l // 2 + 1,), rxx.shape) - rxx_ = rxx.eval() - # Note that RXX[0] = 1, despite the fact that E[X^2] = 9, and this is - # due to normalize=True. - # OSS CPU FFT has some accuracy issues is not the most accurate. - # So this tolerance is a bit bad. - self.assertAllClose(1., rxx_[0], rtol=0.05) - # The maximal error in the rest of the sequence is not great. - self.assertAllClose(np.zeros(l // 2), rxx_[1:], atol=0.1) - # The mean error in the rest is ok, actually 0.008 when I tested it. - self.assertLess(np.abs(rxx_[1:]).mean(), 0.02) + with self.cached_session(): + rxx = sample_stats.auto_correlation( + x_ph, max_lags=l // 2, center=True, normalize=True) + if self.use_static_shape: + self.assertAllEqual((l // 2 + 1,), rxx.shape) + rxx_ = rxx.eval() + # Note that RXX[0] = 1, despite the fact that E[X^2] = 9, and this is + # due to normalize=True. + # OSS CPU FFT has some accuracy issues, so this tolerance is a bit bad. + self.assertAllClose(1., rxx_[0], rtol=0.05) + # The maximal error in the rest of the sequence is not great. + self.assertAllClose(np.zeros(l // 2), rxx_[1:], atol=0.1) + # The mean error in the rest is ok, actually 0.008 when I tested it. 
+ self.assertLess(np.abs(rxx_[1:]).mean(), 0.02) class AutoCorrelationTestStaticShapeFloat32(test.TestCase, diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD index 88bf3792c55..ef8c760ad96 100644 --- a/tensorflow/contrib/eager/python/BUILD +++ b/tensorflow/contrib/eager/python/BUILD @@ -29,7 +29,6 @@ py_library( "//tensorflow/python/eager:backprop", "//tensorflow/python/eager:context", "//tensorflow/python/eager:def_function", - "//tensorflow/python/eager:execution_callbacks", "//tensorflow/python/eager:function", "//tensorflow/python/eager:remote", ], diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py index 8080d954eb7..7a7cf712543 100644 --- a/tensorflow/contrib/eager/python/tfe.py +++ b/tensorflow/contrib/eager/python/tfe.py @@ -39,15 +39,6 @@ To use, at program startup, call `tf.compat.v1.enable_eager_execution()`. @@custom_gradient -@@add_execution_callback -@@clear_execution_callbacks -@@errstate -@@ExecutionCallback -@@inf_callback -@@inf_nan_callback -@@nan_callback -@@seterr - @@Iterator @@Saver @@restore_variables_on_create @@ -117,14 +108,6 @@ from tensorflow.python.eager.context import ASYNC from tensorflow.python.eager.context import num_gpus from tensorflow.python.eager.context import set_server_def from tensorflow.python.eager.def_function import function -from tensorflow.python.eager.execution_callbacks import add_execution_callback -from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks -from tensorflow.python.eager.execution_callbacks import errstate -from tensorflow.python.eager.execution_callbacks import ExecutionCallback -from tensorflow.python.eager.execution_callbacks import inf_callback -from tensorflow.python.eager.execution_callbacks import inf_nan_callback -from tensorflow.python.eager.execution_callbacks import nan_callback -from tensorflow.python.eager.execution_callbacks import seterr from tensorflow.python.eager.remote import connect_to_remote_host from tensorflow.python.framework.tensor_spec import TensorSpec from tensorflow.python.framework.ops import enable_eager_execution diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py index bcc25b8de89..d4f4d657975 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py @@ -87,7 +87,7 @@ class SequenceFeatureColumnIntegrationTest(test.TestCase): ds = ds.batch(20) # Test on a single batch - features = ds.make_one_shot_iterator().get_next() + features = dataset_ops.make_one_shot_iterator(ds).get_next() # Tile the context features across the sequence features seq_layer, _ = sfc.sequence_input_layer(features, seq_cols) diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index d48edc027a2..e4ed2c7841a 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -126,7 +126,7 @@ py_test( py_test( name = "estimators_test", - size = "small", + size = "medium", srcs = ["python/learn/estimators/estimators_test.py"], python_version = "PY2", srcs_version = "PY2AND3", diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 9132b2209bc..c762227b20b 100644 --- 
a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -1088,12 +1088,22 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, chief_only_hooks=chief_hooks + model_fn_ops.training_chief_hooks, save_checkpoint_secs=0, # Saving is handled by a hook. save_summaries_steps=self._config.save_summary_steps, + max_wait_secs=self._config.session_creation_timeout_secs, config=self._session_config) as mon_sess: loss = None while not mon_sess.should_stop(): _, loss = mon_sess.run([model_fn_ops.train_op, model_fn_ops.loss]) return loss + def latest_checkpoint(self): + """Finds the filename of the latest saved checkpoint file in `model_dir`. + + Returns: + The full path to the latest checkpoint or `None` if no checkpoint was + found. + """ + return checkpoint_management.latest_checkpoint(self.model_dir) + def _identity_feature_engineering_fn(features, labels): return features, labels diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py index b51ea30959e..e435fd65702 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py +++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py @@ -243,7 +243,8 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig): protocol=None, evaluation_master='', model_dir=None, - session_config=None): + session_config=None, + session_creation_timeout_secs=7200): """Constructor. The superclass `ClusterConfig` may set properties like `cluster_spec`, @@ -282,6 +283,8 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig): the feature. log_step_count_steps: The frequency, in number of global steps, that the global step/sec will be logged during training. + protocol: An optional argument which specifies the protocol used when + starting server. None means default to grpc. evaluation_master: the master on which to perform evaluation. model_dir: directory where model parameters, graph etc are saved. If `None`, will use `model_dir` property in `TF_CONFIG` environment @@ -290,8 +293,11 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig): session_config: a ConfigProto used to set session parameters, or None. Note - using this argument, it is easy to provide settings which break otherwise perfectly good models. Use with care. - protocol: An optional argument which specifies the protocol used when - starting server. None means default to grpc. + session_creation_timeout_secs: Max time workers should wait for a session + to become available (on initialization or when recovering a session) + with MonitoredTrainingSession. Defaults to 7200 seconds, but users may + want to set a lower value to detect problems with variable / session + (re)-initialization more quickly. """ # Neither parent class calls super().__init__(), so here we have to # manually call their __init__() methods. 
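# Illustrative sketch, not part of this change: how the new knob is used. The
# import path matches the module patched here; the 600-second value is only an
# example.

from tensorflow.contrib.learn.python.learn.estimators.run_config import RunConfig

# Give up after 10 minutes instead of the 7200-second default if the chief
# never creates (or re-creates) the shared session.
config = RunConfig(session_creation_timeout_secs=600)

# The timeout reaches MonitoredTrainingSession through the max_wait_secs
# argument added to BaseEstimator._train_model above, and the new
# BaseEstimator.latest_checkpoint() helper returns the newest checkpoint path
# in model_dir.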
@@ -332,6 +338,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig): self._keep_checkpoint_max = keep_checkpoint_max self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours self._model_dir = _get_model_dir(model_dir) + self._session_creation_timeout_secs = session_creation_timeout_secs @experimental def uid(self, whitelist=None): @@ -408,6 +415,10 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig): def log_step_count_steps(self): return self._log_step_count_steps + @property + def session_creation_timeout_secs(self): + return self._session_creation_timeout_secs + def _count_ps(cluster_spec): """Counts the number of parameter servers in cluster_spec.""" diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt index b53f0588b56..72d489c3514 100644 --- a/tensorflow/contrib/makefile/proto_text_cc_files.txt +++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt @@ -34,19 +34,19 @@ tensorflow/core/lib/strings/ordered_code.cc tensorflow/core/lib/strings/proto_text_util.cc tensorflow/core/lib/wav/wav_io.cc tensorflow/core/platform/cpu_info.cc +tensorflow/core/platform/default/env_time.cc +tensorflow/core/platform/default/load_library.cc tensorflow/core/platform/default/logging.cc tensorflow/core/platform/default/mutex.cc +tensorflow/core/platform/default/port.cc tensorflow/core/platform/default/tracing.cc tensorflow/core/platform/denormal.cc tensorflow/core/platform/env.cc +tensorflow/core/platform/error.cc tensorflow/core/platform/file_system.cc tensorflow/core/platform/file_system_helper.cc tensorflow/core/platform/numbers.cc tensorflow/core/platform/posix/env.cc -tensorflow/core/platform/posix/env_time.cc -tensorflow/core/platform/posix/error.cc -tensorflow/core/platform/posix/load_library.cc -tensorflow/core/platform/posix/port.cc tensorflow/core/platform/posix/posix_file_system.cc tensorflow/core/platform/protobuf.cc tensorflow/core/platform/protobuf_util.cc diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py index ab124959001..6543f09e40d 100644 --- a/tensorflow/contrib/seq2seq/python/ops/decoder.py +++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py @@ -410,14 +410,27 @@ def dynamic_decode(decoder, """ (next_outputs, decoder_state, next_inputs, decoder_finished) = decoder.step(time, inputs, state) + decoder_state_sequence_lengths = False if decoder.tracks_own_finished: next_finished = decoder_finished + lengths = getattr(decoder_state, "lengths", None) + if lengths is not None: + # sequence lengths are provided by decoder_state.lengths; overwrite + # our sequence lengths. + decoder_state_sequence_lengths = True + sequence_lengths = math_ops.cast(lengths, dtypes.int32) else: next_finished = math_ops.logical_or(decoder_finished, finished) - next_sequence_lengths = array_ops.where( - math_ops.logical_not(finished), - array_ops.fill(array_ops.shape(sequence_lengths), time + 1), - sequence_lengths) + + if decoder_state_sequence_lengths: + # Just pass something through the loop; at the next iteration we'll pull + # the sequence lengths from the decoder_state again. 
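The new `RunConfig` property ultimately feeds the `max_wait_secs` argument of `MonitoredTrainingSession` (see the estimator hunk above). A stand-alone sketch of what that argument controls, assuming a single-process graph-mode program; the checkpoint directory is illustrative:

```python
import tensorflow as tf

tf.compat.v1.disable_eager_execution()
step = tf.compat.v1.train.get_or_create_global_step()
train_op = tf.compat.v1.assign_add(step, 1)

# On a worker (is_chief=False) this call waits up to max_wait_secs for the
# chief to initialize a session before giving up, instead of the 7200 s
# default. With is_chief=True it simply creates the session locally.
with tf.compat.v1.train.MonitoredTrainingSession(
        is_chief=True,
        checkpoint_dir='/tmp/my_model',   # hypothetical directory
        max_wait_secs=600) as sess:
    sess.run(train_op)
```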
+ next_sequence_lengths = sequence_lengths + else: + next_sequence_lengths = array_ops.where( + math_ops.logical_not(finished), + array_ops.fill(array_ops.shape(sequence_lengths), time + 1), + sequence_lengths) nest.assert_same_structure(state, decoder_state) nest.assert_same_structure(outputs_ta, next_outputs) diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD index 737d6866283..edfc6639b0f 100644 --- a/tensorflow/contrib/session_bundle/BUILD +++ b/tensorflow/contrib/session_bundle/BUILD @@ -303,8 +303,7 @@ cc_library( hdrs = ["signature.h"], deprecation = "No longer supported. Switch to SavedModel immediately.", visibility = ["//visibility:public"], - deps = [ - ] + if_not_mobile([ + deps = if_not_mobile([ ":manifest_proto_cc", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index 017d08f5f60..1161f52cfbc 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -1,5 +1,6 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. +# Placeholder for Google-internal load statements. load("//tensorflow/core/platform:default/build_config.bzl", "tf_proto_library") load("//tensorflow:tensorflow.bzl", "py_test") diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 75f6d942dba..2dd6240cdf4 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -106,11 +106,7 @@ load("//tensorflow:tensorflow.bzl", "tf_version_info_genrule") load( "//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", - "tf_additional_cloud_kernel_deps", - "tf_additional_cloud_op_deps", "tf_additional_core_deps", - "tf_additional_device_tracer_cuda_deps", - "tf_additional_device_tracer_test_flags", "tf_additional_human_readable_json_deps", "tf_additional_lib_defines", "tf_additional_lib_deps", @@ -384,6 +380,7 @@ cc_library( name = "util_port", srcs = ["util/port.cc"], hdrs = ["util/port.h"], + copts = tf_copts(), visibility = [ "//tensorflow/core:__pkg__", "//tensorflow/python:__pkg__", @@ -726,20 +723,7 @@ cc_library( cc_library( name = "lib", hdrs = [ - "lib/hash/crc32c.h", - "lib/hash/hash.h", "lib/histogram/histogram.h", - "lib/io/buffered_inputstream.h", - "lib/io/compression.h", - "lib/io/inputstream_interface.h", - "lib/io/path.h", - "lib/io/proto_encode_helper.h", - "lib/io/random_inputstream.h", - "lib/io/record_reader.h", - "lib/io/record_writer.h", - "lib/io/table.h", - "lib/io/table_builder.h", - "lib/io/table_options.h", "lib/monitoring/collected_metrics.h", "lib/monitoring/collection_registry.h", "lib/monitoring/counter.h", @@ -755,6 +739,8 @@ cc_library( "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_headers", "//tensorflow/core/lib/gtl:legacy_lib_gtl_headers", + "//tensorflow/core/lib/hash:legacy_lib_hash_all_headers", + "//tensorflow/core/lib/io:legacy_lib_io_headers", "//tensorflow/core/lib/math:math_util.h", "//tensorflow/core/lib/random:legacy_lib_random_headers", "//tensorflow/core/lib/strings:legacy_lib_string_headers", @@ -1445,7 +1431,7 @@ cc_library( ]) + if_tensorrt([ "//tensorflow/compiler/tf2tensorrt:trt_engine_resource_ops_op_lib", "//tensorflow/compiler/tf2tensorrt:trt_op_libs", - ]) + tf_additional_cloud_op_deps(), + ]), alwayslink = 1, ) @@ -1609,7 +1595,7 @@ cc_library( "//tensorflow/core/kernels:training_ops", "//tensorflow/core/kernels:word2vec_kernels", 
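The `dynamic_decode` change keeps per-batch sequence lengths coming from `decoder_state.lengths` when the decoder tracks its own finished state, and otherwise falls back to the original `where`/`fill` update. A small, self-contained sketch of that fallback step, assuming eager execution; the tensors below are made-up example values, not library internals:

```python
import tensorflow as tf

finished = tf.constant([True, False, False])          # batch entries done so far
sequence_lengths = tf.constant([2, 0, 0], tf.int32)   # lengths recorded so far
time = tf.constant(2, tf.int32)                       # current decode step

# Unfinished entries advance to time + 1; finished ones keep their length.
next_sequence_lengths = tf.where(
    tf.logical_not(finished),
    tf.fill(tf.shape(sequence_lengths), time + 1),
    sequence_lengths)

print(next_sequence_lengths)  # -> [2 3 3]
```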
"//tensorflow/core/kernels/sparse:kernels", - ] + tf_additional_cloud_kernel_deps() + if_not_windows([ + ] + if_not_windows([ "//tensorflow/core/kernels:fact_op", "//tensorflow/core/kernels:array_not_windows", "//tensorflow/core/kernels:math_not_windows", @@ -1801,6 +1787,10 @@ filegroup( "//tensorflow/core/lib/core:legacy_lib_core_all_headers", "//tensorflow/core/lib/core:legacy_lib_core_all_srcs", "//tensorflow/core/lib/gtl:legacy_lib_gtl_all_headers", + "//tensorflow/core/lib/hash:legacy_lib_hash_all_headers", + "//tensorflow/core/lib/hash:legacy_lib_hash_all_srcs", + "//tensorflow/core/lib/io:legacy_lib_io_all_headers", + "//tensorflow/core/lib/io:legacy_lib_io_all_srcs", "//tensorflow/core/lib/random:legacy_lib_random_all_headers", "//tensorflow/core/lib/random:legacy_lib_random_all_srcs", "//tensorflow/core/lib/strings:legacy_lib_strings_all_headers", @@ -2336,7 +2326,7 @@ tf_proto_library_cc( name = "worker_proto", srcs = ["protobuf/worker.proto"], cc_api_version = 2, - protodeps = tf_additional_all_protos() + [], + protodeps = tf_additional_all_protos(), visibility = ["//visibility:public"], ) @@ -2393,6 +2383,8 @@ LIB_INTERNAL_PRIVATE_HEADERS = [ "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_all_headers", "//tensorflow/core/lib/gtl:legacy_lib_gtl_all_headers", + "//tensorflow/core/lib/hash:legacy_lib_hash_all_headers", + "//tensorflow/core/lib/io:legacy_lib_io_all_headers", "//tensorflow/core/lib/random:legacy_lib_random_all_headers", "//tensorflow/core/lib/strings:legacy_lib_strings_all_headers", "//tensorflow/core/lib/math:math_util.h", @@ -2411,14 +2403,8 @@ LIB_INTERNAL_PRIVATE_HEADERS = [ LIB_INTERNAL_PUBLIC_HEADERS = [ "//tensorflow/core/lib/core:legacy_lib_internal_core_headers", "//tensorflow/core/lib/gtl:legacy_lib_internal_public_gtl_headers", - "lib/hash/hash.h", - "lib/io/inputbuffer.h", - "lib/io/iterator.h", - "lib/io/snappy/snappy_inputbuffer.h", - "lib/io/snappy/snappy_outputbuffer.h", - "lib/io/zlib_compression_options.h", - "lib/io/zlib_inputstream.h", - "lib/io/zlib_outputbuffer.h", + "//tensorflow/core/lib/hash:legacy_lib_internal_public_headers", + "//tensorflow/core/lib/io:legacy_lib_internal_public_headers", "lib/monitoring/mobile_counter.h", "lib/monitoring/mobile_gauge.h", "lib/monitoring/mobile_sampler.h", @@ -2483,7 +2469,6 @@ cc_library( exclude = [ "**/*test*", "framework/variant.cc", - "lib/hash/crc32c_accelerate.cc", "lib/gif/**/*", "lib/jpeg/**/*", "lib/png/**/*", @@ -2493,6 +2478,8 @@ cc_library( "//tensorflow/core/platform:legacy_platform_lib_srcs", "//tensorflow/core/platform:legacy_lib_internal_srcs", "//tensorflow/core/lib/core:legacy_lib_core_all_srcs", + "//tensorflow/core/lib/hash:legacy_lib_internal_impl_srcs", + "//tensorflow/core/lib/io:legacy_lib_io_all_srcs", "//tensorflow/core/lib/random:legacy_lib_random_all_srcs", "//tensorflow/core/lib/strings:legacy_lib_strings_all_srcs", ], @@ -2529,7 +2516,9 @@ cc_library( # File compiled with extra flags to get cpu-specific acceleration. cc_library( name = "lib_hash_crc32c_accelerate_internal", - srcs = ["lib/hash/crc32c_accelerate.cc"], + srcs = [ + "//tensorflow/core/lib/hash:legacy_crc32_accelerate_srcs", + ], # -msse4.2 enables the use of crc32c compiler builtins. 
copts = tf_copts() + if_linux_x86_64(["-msse4.2"]), ) @@ -3412,28 +3401,6 @@ cc_library( alwayslink = 1, ) -tf_cuda_library( - name = "device_tracer", - srcs = [ - "//tensorflow/core/platform:legacy_device_tracer_srcs", - ], - copts = tf_copts(), - cuda_deps = tf_additional_device_tracer_cuda_deps(), - visibility = [ - "//tensorflow:internal", - ], - deps = [ - ":core_cpu_internal", - ":lib", - ":protos_all_cc", - "//tensorflow/core/profiler/internal:parse_annotation", - "//tensorflow/core/profiler/internal:profiler_interface", - "//tensorflow/core/profiler/lib:traceme", - "@com_google_absl//absl/flags:flag", - ], - alwayslink = True, -) - tf_proto_library_cc( name = "replay_log_proto", srcs = ["protobuf/replay_log.proto"], @@ -3649,10 +3616,8 @@ cc_library( name = "lib_test_internal", testonly = 1, hdrs = [ - "lib/io/block.h", - "lib/io/block_builder.h", - "lib/io/format.h", "//tensorflow/core/lib/gtl:legacy_lib_test_internal_headers", + "//tensorflow/core/lib/io:legacy_lib_test_internal_headers", "//tensorflow/core/lib/random:legacy_lib_test_internal_headers", ], deps = [ @@ -3732,19 +3697,7 @@ tf_cc_tests( name = "low_level_library_tests", size = "small", srcs = [ - "lib/hash/crc32c_test.cc", - "lib/hash/hash_test.cc", "lib/histogram/histogram_test.cc", - "lib/io/buffered_inputstream_test.cc", - "lib/io/inputbuffer_test.cc", - "lib/io/inputstream_interface_test.cc", - "lib/io/path_test.cc", - "lib/io/random_inputstream_test.cc", - "lib/io/record_reader_writer_test.cc", - "lib/io/recordio_test.cc", - "lib/io/snappy/snappy_buffers_test.cc", - "lib/io/table_test.cc", - "lib/io/zlib_buffers_test.cc", "lib/monitoring/collection_registry_test.cc", "lib/monitoring/counter_test.cc", "lib/monitoring/gauge_test.cc", @@ -3753,6 +3706,8 @@ tf_cc_tests( "lib/wav/wav_io_test.cc", "//tensorflow/core/lib/core:legacy_lib_core_all_tests", "//tensorflow/core/lib/gtl:legacy_lib_gtl_tests", + "//tensorflow/core/lib/hash:legacy_lib_hash_all_tests", + "//tensorflow/core/lib/io:legacy_lib_io_all_tests", "//tensorflow/core/lib/math:math_util_test.cc", "//tensorflow/core/lib/random:legacy_lib_random_tests", "//tensorflow/core/lib/strings:legacy_low_level_library_tests", @@ -3789,6 +3744,7 @@ tf_cc_tests( "//third_party/eigen3", "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/types:optional", "@zlib_archive//:zlib", ], ) @@ -5259,36 +5215,6 @@ tf_cc_test( ], ) -tf_cc_test_gpu( - name = "device_tracer_test", - size = "small", - srcs = ["//tensorflow/core/platform:device_tracer_test.cc"], - args = - ["--heap_check=local"] + tf_additional_device_tracer_test_flags(), - linkstatic = tf_kernel_tests_linkstatic(), - tags = tf_cuda_tests_tags() + ["nomac"], - deps = [ - ":all_kernels", - ":core_cpu", - ":core_cpu_internal", - ":device_tracer", - ":direct_session", - ":direct_session_internal", - ":framework", - ":framework_internal", - ":gpu_runtime", - ":lib", - ":lib_internal", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//tensorflow/core/kernels:ops_util", - "//tensorflow/core/profiler/internal:profiler_interface", - ], -) - tf_cc_tests( name = "common_runtime_input_colocation_exemption_registry_test", size = "small", diff --git a/tensorflow/core/api_def/base_api/api_def_Bitcast.pbtxt b/tensorflow/core/api_def/base_api/api_def_Bitcast.pbtxt index f65ce366587..8d6b80094c7 100644 --- a/tensorflow/core/api_def/base_api/api_def_Bitcast.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Bitcast.pbtxt @@ -18,21 +18,23 @@ 
gives module error. For example, Example 1: -```python + >>> a = [1., 2., 3.] ->>> equality_bitcast = tf.bitcast(a,tf.complex128) -tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot bitcast from float to complex128: shape [3] [Op:Bitcast] ->>> equality_cast = tf.cast(a,tf.complex128) +>>> equality_bitcast = tf.bitcast(a, tf.complex128) +Traceback (most recent call last): +... +InvalidArgumentError: Cannot bitcast from 1 to 18 [Op:Bitcast] +>>> equality_cast = tf.cast(a, tf.complex128) >>> print(equality_cast) tf.Tensor([1.+0.j 2.+0.j 3.+0.j], shape=(3,), dtype=complex128) -``` + Example 2: -```python + >>> tf.bitcast(tf.constant(0xffffffff, dtype=tf.uint32), tf.uint8) -``` + Example 3: -```python + >>> x = [1., 2., 3.] >>> y = [0., 2., 3.] >>> equality= tf.equal(x,y) @@ -44,10 +46,9 @@ tf.Tensor([False True True], shape=(3,), dtype=bool) tf.Tensor([0. 1. 1.], shape=(3,), dtype=float32) >>> print(equality_bitcast) tf.Tensor( -[[ 0 0 0 0] - [ 0 0 128 63] - [ 0 0 128 63]], shape=(3, 4), dtype=uint8) -``` + [[ 0 0 0 0] + [ 0 0 128 63] + [ 0 0 128 63]], shape=(3, 4), dtype=uint8) *NOTE*: Bitcast is implemented as a low-level cast, so machines with different endian orderings will give different results. diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt index 669223df862..2af0ea31c62 100644 --- a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt @@ -28,14 +28,13 @@ and works its way forward. For example, -```python >>> x = tf.constant([1, 2, 3]) >>> y = tf.broadcast_to(x, [3, 3]) ->>> sess.run(y) -array([[1, 2, 3], - [1, 2, 3], - [1, 2, 3]], dtype=int32) -``` +>>> print(y) +tf.Tensor( + [[1 2 3] + [1 2 3] + [1 2 3]], shape=(3, 3), dtype=int32) In the above example, the input Tensor with the shape of `[1, 3]` is broadcasted to output Tensor with shape of `[3, 3]`. diff --git a/tensorflow/core/api_def/base_api/api_def_FFT3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT3D.pbtxt index abd2e67bceb..33de5f424c9 100644 --- a/tensorflow/core/api_def/base_api/api_def_FFT3D.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_FFT3D.pbtxt @@ -3,13 +3,13 @@ op { in_arg { name: "input" description: <